diff -Nru hypre-2.11.2/AUTOTEST/autotest.sh hypre-2.13.0/AUTOTEST/autotest.sh --- hypre-2.11.2/AUTOTEST/autotest.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/autotest.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,283 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -# Setup -testing_dir=`cd ..; pwd` -autotest_dir="$testing_dir/AUTOTEST" -finished_dir="$testing_dir/AUTOTEST-FINISHED" -output_dir="$testing_dir/AUTOTEST-`date +%Y.%m.%d-%a`" -src_dir="$testing_dir/hypre/src" -remote_dir="test-hypre" -summary_file="SUMMARY.html" -summary_subject="Autotest Error Summary `date +%Y-%m-%d`" -email_list="rfalgout@llnl.gov, tzanio@llnl.gov, umyang@llnl.gov, schroder2@llnl.gov, oseikuffuor1@llnl.gov, wang84@llnl.gov, li50@llnl.gov" - -# Main loop -test_opts="" -while [ "$*" ] -do - case $1 in - -h|-help) - cat <" > $summary_file; - echo " " >> $summary_file; - echo "
"          >> $summary_file;
-         echo $summary_subject >> $summary_file
-
-
-         # all top-level tests with empty error files are reported as "passed",
-         # not including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[PASSED]" >> $summary_file
-         for test in $( find . -maxdepth 1 -size 0 -name "*.err" ! -name "*cron*" )
-         do
-            testname=`basename $test .err`
-            echo "-${testname#machine-}" >> $summary_file
-         done
-
-         # active tests without a *-done file are reported as "pending"
-         echo ""          >> $summary_file;
-         echo "[PENDING]" >> $summary_file
-         for test in $( find . -name "autotest-*-start" )
-         do
-            testbase=`basename $test -start`
-            if [ ! -e $testbase-done ]; then
-               echo $testbase | sed {s/autotest//g} >> $output_dir/$summary_file
-            else
-               rm -f $autotest_dir/$testbase*
-            fi
-         done
-
-         # all top-level tests with non-empty error files are reported as "failed",
-         # including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[FAILED]" >> $summary_file
-         for test in $( find . -maxdepth 1 ! -size 0 -name "*.err" )
-         do
-            testname=`basename $test .err`
-            for prefix in "machine-" "autotest-";
-            do
-               testname="${testname#$prefix}"
-            done
-            echo "-$testname" >> $summary_file
-         done
-
-         # keep a time stamp of last runs and report if more than 10 days
-         echo ""           >> $summary_file;
-         echo "[LAST RUN]" >> $summary_file
-         for test in $( find . -maxdepth 1 -name "autotest-*-done" )
-         do
-            testname=`basename $test -done`
-            testname="${testname#autotest-}"
-            touch $testing_dir/lastrun-$testname
-         done
-         for test in $( find $testing_dir -maxdepth 1 -name "lastrun-*" -atime +10 )
-         do
-            testdate=`ls -l $test | awk '{print $6" "$7" "$8}'`
-            testname=`basename $test`
-            testname="${testname#lastrun-}"
-            echo "-$testname  $testdate" >> $summary_file
-         done
-
-         # list all non-empty error files in todays output directory
-         echo ""              >> $summary_file;
-         echo "[ERROR FILES]" >> $summary_file
-         for test in $( find $output_dir ! -size 0 -name "*.err" | sort -r )
-         do
-            echo "$test" >> $summary_file
-         done
-
-         echo "
" >> $summary_file; - echo "" >> $summary_file; - - if [ "$1" = "-summary-email" ]; then - # send the email - ( - echo To: $email_list - echo Subject: $summary_subject - echo Content-Type: text/html - echo MIME-Version: 1.0 - - cat $summary_file - - ) | /usr/sbin/sendmail -t - fi - - if [ "$1" = "-summary-copy" ]; then - # copy output_dir files to the specified remote testing_dir - rem_finished_dir="$2/AUTOTEST-FINISHED" - scp -q -r * $rem_finished_dir - fi - - test_opts="" - break - ;; - - *) - test_opts="$test_opts $1" - shift - ;; - esac -done - -# Ensure that important directories exist -if [ -n "$test_opts" ]; then - cd $testing_dir - mkdir -p $autotest_dir - mkdir -p $finished_dir - cd $autotest_dir -fi - -# Run tests -for opt in $test_opts -do - # TODO: use a "-:" format to avoid this? - case $opt in - -tux[0-9]*-compilers) - host=`echo $opt | awk -F- '{print $2}'` - name="tux-compilers" - ;; - - -tux[0-9]*) - host=`echo $opt | awk -F- '{print $2}'` - name="tux" - ;; - - -mac) - host="parsol" - name="mac" - ;; - - *) - host=`echo $opt | awk -F- '{print $2}'` - name=$host - ;; - esac - - if [ ! -e autotest-$name-start ]; then - echo "Test [machine-$name] started at `date +%T` on `date +%D`" \ - >> autotest-$name-start - ./testsrc.sh $src_dir $host:$remote_dir/$host machine-$name.sh - echo "Test [machine-$name] finished at `date +%T` on `date +%D`" \ - >> autotest-$name-start - mv machine-$name.??? $finished_dir - touch autotest-$name-done - fi -done - -# Fix permissions -cd $testing_dir -ch_dirs="hypre $autotest_dir $finished_dir $output_dir" -for dir in $ch_dirs lastrun-* -do - if [ -e $dir ]; then - chmod -fR a+rX,ug+w,o-w $dir - # chgrp -fR hypre $dir - fi -done - -# move all but the last 10 autotest results into yearly subdirectories -files=`echo AUTOTEST-2*.*` -count=`echo $files | wc | awk '{print $2}'` -for i in $files -do - if [ $count -le 10 ]; then - break; - fi - dir=`echo $i | awk -F '.' '{print $1}'` - if [ ! -d $dir ]; then - mkdir $dir - chmod -fR a+rX,ug+w,o-w $dir - # chgrp -fR hypre $dir - fi - mv $i $dir/$i - count=`expr $count - 1` -done - diff -Nru hypre-2.11.2/AUTOTEST/basic.sh hypre-2.13.0/AUTOTEST/basic.sh --- hypre-2.11.2/AUTOTEST/basic.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/basic.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,106 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +testname=`basename $0 .sh` + +# Echo usage information +case $1 in + -h|-help) + cat < options for configure script + -mo: options for make script + -ro: call the run script with these options + -eo: call the examples script with these options + -h|-help prints this usage information and exits + + This script configures and compiles the source in {src_dir}, then optionally + runs driver and example tests. + + Example usage: $0 ../src -ro: -ij -sstruct + +EOF + exit + ;; +esac + +# Set src_dir +src_dir=`cd $1; pwd` +shift + +# Parse the rest of the command line +copts="" +mopts="" +ropts="" +eopts="" +while [ "$*" ] +do + case $1 in + -co:) + opvar="copts"; shift + ;; + -mo:) + opvar="mopts"; shift + ;; + -ro:) + opvar="ropts"; rset="yes"; shift + ;; + -eo:) + opvar="eopts"; eset="yes"; shift + ;; + *) + eval $opvar=\"\$$opvar $1\" + shift + ;; + esac +done + +# Setup +test_dir=`pwd` +output_dir=`pwd`/$testname.dir +rm -fr $output_dir +mkdir -p $output_dir + +# Configure +# NOTE: The use of 'eval' is needed to deal properly with nested quotes in argument lists +eval ./test.sh configure.sh $src_dir $copts +mv -f configure.??? $output_dir + +# Make +./test.sh make.sh $src_dir $mopts +mv -f make.??? $output_dir + +# Run +if [ -n "$rset" ]; then + ./test.sh run.sh $src_dir $ropts + mv -f run.??? $output_dir +fi + +# Examples +if [ -n "$eset" ]; then + ./test.sh examples.sh $src_dir $eopts + mv -f examples.??? $output_dir +fi + +# Echo to stderr all nonempty error files in $output_dir +for errfile in $( find $output_dir ! -size 0 -name "*.err" ) +do + echo $errfile >&2 +done + +# Clean up +( cd $src_dir; make distclean ) + diff -Nru hypre-2.11.2/AUTOTEST/basictest.sh hypre-2.13.0/AUTOTEST/basictest.sh --- hypre-2.11.2/AUTOTEST/basictest.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/basictest.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,105 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -testname=`basename $0 .sh` - -# Echo usage information -case $1 in - -h|-help) - cat < options for configure script - -mo: options for make script - -ro: call the run script with these options - -eo: call the examples script with these options - -h|-help prints this usage information and exits - - This script configures and compiles the source in {src_dir}, then optionally - runs driver and example tests. - - Example usage: $0 ../src -ro: -ij -sstruct - -EOF - exit - ;; -esac - -# Set src_dir -src_dir=`cd $1; pwd` -shift - -# Parse the rest of the command line -copts="" -mopts="" -ropts="" -eopts="" -while [ "$*" ] -do - case $1 in - -co:) - opvar="copts"; shift - ;; - -mo:) - opvar="mopts"; shift - ;; - -ro:) - opvar="ropts"; rset="yes"; shift - ;; - -eo:) - opvar="eopts"; eset="yes"; shift - ;; - *) - eval $opvar=\"\$$opvar $1\" - shift - ;; - esac -done - -# Setup -test_dir=`pwd` -output_dir=`pwd`/$testname.dir -rm -fr $output_dir -mkdir -p $output_dir - -# Configure -./test.sh configure.sh $src_dir $copts -mv -f configure.??? $output_dir - -# Make -./test.sh make.sh $src_dir $mopts -mv -f make.??? $output_dir - -# Run -if [ -n "$rset" ]; then - ./test.sh run.sh $src_dir $ropts - mv -f run.??? $output_dir -fi - -# Examples -if [ -n "$eset" ]; then - ./test.sh examples.sh $src_dir $eopts - mv -f examples.??? $output_dir -fi - -# Echo to stderr all nonempty error files in $output_dir -for errfile in $( find $output_dir ! -size 0 -name "*.err" ) -do - echo $errfile >&2 -done - -# Clean up -( cd $src_dir; make distclean ) - diff -Nru hypre-2.11.2/AUTOTEST/check-double.filters hypre-2.13.0/AUTOTEST/check-double.filters --- hypre-2.11.2/AUTOTEST/check-double.filters 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/check-double.filters 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,11 @@ /_hypre_utilities.h: /HYPRE_utilities.h: -/mpistubs.c +/utilities/general.h: +/utilities/gpuErrorCheck.h: +/utilities/gpuErrorCheck.c: +/utilities/gpuMem.h: +/utilities/hypre_nvtx.h: +/utilities/mpistubs.c +/seq_mv/gpukernels.h: +/seq_mv/seq_mv.h:.*cudaStream_t double-check diff -Nru hypre-2.11.2/AUTOTEST/check-int.filters hypre-2.13.0/AUTOTEST/check-int.filters --- hypre-2.11.2/AUTOTEST/check-int.filters 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/check-int.filters 2017-10-20 17:42:22.000000000 +0000 @@ -3,6 +3,14 @@ /hypre_printf.c: /_hypre_utilities.h: /HYPRE_utilities.h: +/utilities/general.h: +/utilities/gpuErrorCheck.h: +/utilities/gpuErrorCheck.c: +/utilities/gpuMem.h: +/utilities/hypre_nvtx.h: +/utilities/mpistubs.c +/seq_mv/gpukernels.h: +/seq_mv/seq_mv.h:.*cudaStream_t as long as too long long range interpolation diff -Nru hypre-2.11.2/AUTOTEST/check-mpi.filters hypre-2.13.0/AUTOTEST/check-mpi.filters --- hypre-2.11.2/AUTOTEST/check-mpi.filters 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/check-mpi.filters 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,10 @@ /HYPRE_config.h: /_hypre_utilities.h: /HYPRE_utilities.h: -/mpistubs.c: -/mpistubs.h: -/thread_mpistubs.c: -/thread_mpistubs.h: +/utilities/gpuMem.c: +/utilities/mpistubs.c: +/utilities/mpistubs.h: +/parcsr_mv/par_csr_matvec.c:.*MPI_PACK +/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_SEND +/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_RECV +/parcsr_mv/par_csr_matvec.c:.*MPI_UNPACK diff -Nru hypre-2.11.2/AUTOTEST/cmake.sh hypre-2.13.0/AUTOTEST/cmake.sh --- hypre-2.11.2/AUTOTEST/cmake.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/cmake.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,127 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +testname=`basename $0 .sh` + +drivers="ij sstruct struct ams_driver maxwell_unscaled sstruct_fac ij_mv struct_migrate" + +# Echo usage information +case $1 in + -h|-help) + cat < configuration options + -mo: make options + -ro: call the run script with these options + -eo: call the examples script with these options + -h|-help prints this usage information and exits + + This script uses cmake to configure and compile the source in {src_dir}, then + optionally runs driver and example tests. + + Example usage: $0 ../src -co -DCMAKE_BUILD_TYPE=Debug -ro: -ij + +EOF + exit + ;; +esac + +# Set src_dir +src_dir=`cd $1; pwd` +shift + +# Parse the rest of the command line +copts="" +mopts="" +ropts="" +eopts="" +while [ "$*" ] +do + case $1 in + -co:) + opvar="copts"; shift + ;; + -mo:) + opvar="mopts"; shift + ;; + -ro:) + opvar="ropts"; rset="yes"; shift + ;; + -eo:) + opvar="eopts"; eset="yes"; shift + ;; + *) + eval $opvar=\"\$$opvar $1\" + shift + ;; + esac +done + +# Setup +test_dir=`pwd` +output_dir=`pwd`/$testname.dir +rm -fr $output_dir +mkdir -p $output_dir +cd $src_dir +src_dir=`pwd` + +# Clean up the cmbuild directories (do it from src_dir as a precaution) +cd $src_dir +rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'` +rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'` + +# Clean up the previous install +cd $src_dir +rm -fr hypre + +# Configure +cd $src_dir/cmbuild +cmake $copts .. +make $mopts install + +# Make +cd $src_dir/test/cmbuild +cmake .. +make $mopts +mv -f $drivers .. + +cd $test_dir + +# Run +if [ -n "$rset" ]; then + ./test.sh run.sh $src_dir $ropts + mv -f run.??? $output_dir +fi + +# Examples +if [ -n "$eset" ]; then + ./test.sh examples.sh $src_dir $eopts + mv -f examples.??? $output_dir +fi + +# Echo to stderr all nonempty error files in $output_dir +for errfile in $( find $output_dir ! -size 0 -name "*.err" ) +do + echo $errfile >&2 +done + +# Clean up +cd $src_dir +rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'` +rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'` +rm -fr hypre +( cd $src_dir/test; rm -f $drivers; ./cleantest.sh ) + diff -Nru hypre-2.11.2/AUTOTEST/cmaketest.sh hypre-2.13.0/AUTOTEST/cmaketest.sh --- hypre-2.11.2/AUTOTEST/cmaketest.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/cmaketest.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,127 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -testname=`basename $0 .sh` - -drivers="ij sstruct struct ams_driver maxwell_unscaled sstruct_fac ij_mv struct_migrate" - -# Echo usage information -case $1 in - -h|-help) - cat < configuration options - -mo: make options - -ro: call the run script with these options - -eo: call the examples script with these options - -h|-help prints this usage information and exits - - This script uses cmake to configure and compile the source in {src_dir}, then - optionally runs driver and example tests. - - Example usage: $0 ../src -co -DCMAKE_BUILD_TYPE=Debug -ro: -ij - -EOF - exit - ;; -esac - -# Set src_dir -src_dir=`cd $1; pwd` -shift - -# Parse the rest of the command line -copts="" -mopts="" -ropts="" -eopts="" -while [ "$*" ] -do - case $1 in - -co:) - opvar="copts"; shift - ;; - -mo:) - opvar="mopts"; shift - ;; - -ro:) - opvar="ropts"; rset="yes"; shift - ;; - -eo:) - opvar="eopts"; eset="yes"; shift - ;; - *) - eval $opvar=\"\$$opvar $1\" - shift - ;; - esac -done - -# Setup -test_dir=`pwd` -output_dir=`pwd`/$testname.dir -rm -fr $output_dir -mkdir -p $output_dir -cd $src_dir -src_dir=`pwd` - -# Clean up the cmbuild directories (do it from src_dir as a precaution) -cd $src_dir -rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'` -rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'` - -# Clean up the previous install -cd $src_dir -rm -fr hypre - -# Configure -cd $src_dir/cmbuild -cmake $copts .. -make $mopts install - -# Make -cd $src_dir/test/cmbuild -cmake .. -make $mopts -mv -f $drivers .. - -cd $test_dir - -# Run -if [ -n "$rset" ]; then - ./test.sh run.sh $src_dir $ropts - mv -f run.??? $output_dir -fi - -# Examples -if [ -n "$eset" ]; then - ./test.sh examples.sh $src_dir $eopts - mv -f examples.??? $output_dir -fi - -# Echo to stderr all nonempty error files in $output_dir -for errfile in $( find $output_dir ! -size 0 -name "*.err" ) -do - echo $errfile >&2 -done - -# Clean up -cd $src_dir -rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'` -rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'` -rm -fr hypre -( cd $src_dir/test; rm -f $drivers; cleantest.sh ) - diff -Nru hypre-2.11.2/AUTOTEST/configure.sh hypre-2.13.0/AUTOTEST/configure.sh --- hypre-2.11.2/AUTOTEST/configure.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/configure.sh 2017-10-20 17:42:22.000000000 +0000 @@ -40,12 +40,13 @@ shift # Run configure +# NOTE: The use of 'eval' is needed to deal properly with nested quotes in argument lists cd $src_dir if [ "`uname -s`" = "AIX" ] then - nopoe ./configure $@ + eval nopoe ./configure $@ else - ./configure $@ + eval ./configure $@ fi # Save config.log, HYPRE_config.h and Makefile.config diff -Nru hypre-2.11.2/AUTOTEST/cronfile hypre-2.13.0/AUTOTEST/cronfile --- hypre-2.11.2/AUTOTEST/cronfile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/cronfile 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -# The first five entries on each line correspond respectively to: -# -# minute (0-56) -# hour (0-23) -# day of month (1-31) -# month (1-12) -# day of week (0-6)(0=Sunday) -# -# '*' means "every" and '*/m' means "every m-th" - -# Rob's crontab (on tux339) - -30 23 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -checkout >> autotest-tux-cron.out 2>> autotest-tux-cron.err -00 1 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -tux339 >> autotest-tux-cron.out 2>> autotest-tux-cron.err -00 6 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -summary-email - - -# Rob's crontab (on rzcereal2) - -40 0 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -checkout > autotest-rzcereal2-checkout.out 2>&1 -00 1 * * 2 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -vulcan >> autotest-vulcan-cron.out 2>> autotest-vulcan-cron.err -00 1 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzmerl >> autotest-rzmerl-cron.out 2>> autotest-rzmerl-cron.err -00 3 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzcereal3 >> autotest-rzcereal3-cron.out 2>> autotest-rzcereal3-cron.err -00 1 * * 4 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzzeus >> autotest-rzzeus-cron.out 2>> autotest-rzzeus-cron.err -00 5 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -summary-copy tux339:/usr/casc/hypre/test-hypre - -# Tzanio's crontab (on tux252) - -00 1 * * * cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -mac >> autotest-mac-cron.out 2>> autotest-mac-cron.err diff -Nru hypre-2.11.2/AUTOTEST/machine-mac.sh hypre-2.13.0/AUTOTEST/machine-mac.sh --- hypre-2.11.2/AUTOTEST/machine-mac.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-mac.sh 2017-10-20 17:42:22.000000000 +0000 @@ -51,16 +51,16 @@ ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED" co="--disable-fortran" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -renametest.sh basictest $output_dir/basictest-default +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro +./renametest.sh basic $output_dir/basic-default co="--enable-debug --disable-fortran" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--enable-debug +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-debug co="--enable-bigint --disable-fortran" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--enable-bigint +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-bigint # Test linking for different languages link_opts="all++" diff -Nru hypre-2.11.2/AUTOTEST/machine-rzmerl.sh hypre-2.13.0/AUTOTEST/machine-rzmerl.sh --- hypre-2.11.2/AUTOTEST/machine-rzmerl.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-rzmerl.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -testname=`basename $0 .sh` - -# Echo usage information -case $1 in - -h|-help) - cat <&2 -done diff -Nru hypre-2.11.2/AUTOTEST/machine-rztopaz.sh hypre-2.13.0/AUTOTEST/machine-rztopaz.sh --- hypre-2.11.2/AUTOTEST/machine-rztopaz.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-rztopaz.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,80 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +testname=`basename $0 .sh` + +# Echo usage information +case $1 in + -h|-help) + cat <&2 +done diff -Nru hypre-2.11.2/AUTOTEST/machine-rzzeus.sh hypre-2.13.0/AUTOTEST/machine-rzzeus.sh --- hypre-2.11.2/AUTOTEST/machine-rzzeus.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-rzzeus.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -testname=`basename $0 .sh` - -# Echo usage information -case $1 in - -h|-help) - cat <&2 -done diff -Nru hypre-2.11.2/AUTOTEST/machine-syrah.sh hypre-2.13.0/AUTOTEST/machine-syrah.sh --- hypre-2.11.2/AUTOTEST/machine-syrah.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-syrah.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,79 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +testname=`basename $0 .sh` + +# Echo usage information +case $1 in + -h|-help) + cat <&2 +done diff -Nru hypre-2.11.2/AUTOTEST/machine-tux-exlibs.sh hypre-2.13.0/AUTOTEST/machine-tux-exlibs.sh --- hypre-2.11.2/AUTOTEST/machine-tux-exlibs.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-tux-exlibs.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,67 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +testname=`basename $0 .sh` + +# Echo usage information +case $1 in + -h|-help) + cat <&2 +done diff -Nru hypre-2.11.2/AUTOTEST/machine-tux.sh hypre-2.13.0/AUTOTEST/machine-tux.sh --- hypre-2.11.2/AUTOTEST/machine-tux.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-tux.sh 2017-10-20 17:42:22.000000000 +0000 @@ -43,73 +43,99 @@ src_dir=`cd $1; pwd` shift +# Organizing the tests from "fast" to "slow" + +# Check for 'int', 'double', and 'MPI_' +./test.sh check-int.sh $src_dir +mv -f check-int.??? $output_dir +./test.sh check-double.sh $src_dir +mv -f check-double.??? $output_dir +./test.sh check-mpi.sh $src_dir +mv -f check-mpi.??? $output_dir + # Basic build and run tests mo="-j test" ro="-ams -ij -sstruct -struct" eo="" co="" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest-default - -co="--enable-debug" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: $eo -renametest.sh basictest $output_dir/basictest-debug1 - -co="--enable-debug --enable-global-partition" -RO="-fac" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo -renametest.sh basictest $output_dir/basictest-debug2 - -co="--enable-debug CC=mpiCC" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo -renametest.sh basictest $output_dir/basictest-debug-cpp - -# co="--with-insure --enable-debug --with-print-errors" -# MO="test" -# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro -# renametest.sh basictest $output_dir/basictest--with-insure1 -# -# co="--with-insure --enable-debug --enable-global-partition" -# MO="test" -# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro -# renametest.sh basictest $output_dir/basictest--with-insure2 +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic-default -co="--enable-debug --with-print-errors" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind -renametest.sh basictest $output_dir/basictest--valgrind1 - -co="--enable-debug --enable-global-partition" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind -renametest.sh basictest $output_dir/basictest--valgrind2 +# Test linking for different languages (depends on previous compile test) +link_opts="all++ all77" +for opt in $link_opts +do + output_subdir=$output_dir/link$opt + mkdir -p $output_subdir + ./test.sh link.sh $src_dir $opt + mv -f link.??? $output_subdir +done co="--without-MPI" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--without-MPI +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--without-MPI co="--with-strict-checking" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--with-strict-checking +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--with-strict-checking + +co="--with-strict-checking --enable-global-partition" +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--with-strict-global co="--enable-shared" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--enable-shared +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-shared -co="--enable-bigint --enable-debug" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint -renametest.sh basictest $output_dir/basictest--enable-bigint +co="--enable-debug --with-openmp" +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-openmp + +co="--enable-debug" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: $eo +./renametest.sh basic $output_dir/basic-debug1 co="--enable-maxdim=4 --enable-debug" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -maxdim -renametest.sh basictest $output_dir/basictest--enable-maxdim=4 +./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -maxdim +./renametest.sh basic $output_dir/basic--enable-maxdim=4 co="--enable-complex --enable-maxdim=4 --enable-debug" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -complex +./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -complex # ignore complex compiler output for now -rm -fr basictest.dir/make.??? -grep -v make.err basictest.err > basictest.tmp -mv basictest.tmp basictest.err -renametest.sh basictest $output_dir/basictest--enable-complex +rm -fr basic.dir/make.??? +grep -v make.err basic.err > basic.tmp +mv basic.tmp basic.err +./renametest.sh basic $output_dir/basic--enable-complex + +co="--enable-debug --enable-global-partition" +RO="-fac" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo +./renametest.sh basic $output_dir/basic-debug2 + +co="--enable-single --enable-debug" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -single +./renametest.sh basic $output_dir/basic--enable-single + +co="--enable-longdouble --enable-debug" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -longdouble +./renametest.sh basic $output_dir/basic--enable-longdouble + +co="--enable-debug CC=mpiCC" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo +./renametest.sh basic $output_dir/basic-debug-cpp + +co="--enable-bigint --enable-debug" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint +./renametest.sh basic $output_dir/basic--enable-bigint + +co="--enable-debug --with-print-errors" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind +./renametest.sh basic $output_dir/basic--valgrind1 + +co="--enable-debug --enable-global-partition" +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind +./renametest.sh basic $output_dir/basic--valgrind2 # CMake build and run tests mo="-j" @@ -117,50 +143,40 @@ eo="" co="" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -renametest.sh cmaketest $output_dir/cmaketest-default - -co="-DCMAKE_BUILD_TYPE=Debug" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro -renametest.sh cmaketest $output_dir/cmaketest-debug +./test.sh cmake.sh $src_dir -co: $co -mo: $mo +./renametest.sh cmake $output_dir/cmake-default co="-DHYPRE_NO_GLOBAL_PARTITION=OFF" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -renametest.sh cmaketest $output_dir/cmaketest-global-partition +./test.sh cmake.sh $src_dir -co: $co -mo: $mo +./renametest.sh cmake $output_dir/cmake-global-partition co="-DHYPRE_SEQUENTIAL=ON" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -renametest.sh cmaketest $output_dir/cmaketest-sequential +./test.sh cmake.sh $src_dir -co: $co -mo: $mo +./renametest.sh cmake $output_dir/cmake-sequential co="-DHYPRE_SHARED=ON" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -renametest.sh cmaketest $output_dir/cmaketest-shared +./test.sh cmake.sh $src_dir -co: $co -mo: $mo +./renametest.sh cmake $output_dir/cmake-shared + +co="-DHYPRE_SINGLE=ON" +./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -single +./renametest.sh cmake $output_dir/cmake-single + +co="-DHYPRE_LONG_DOUBLE=ON" +./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -longdouble +./renametest.sh cmake $output_dir/cmake-longdouble + +co="-DCMAKE_BUILD_TYPE=Debug" +./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro +./renametest.sh cmake $output_dir/cmake-debug co="-DHYPRE_BIGINT=ON" -test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro -renametest.sh cmaketest $output_dir/cmaketest-bigint +./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro +./renametest.sh cmake $output_dir/cmake-bigint # cmake build doesn't currently support maxdim # cmake build doesn't currently support complex -# Test linking for different languages -link_opts="all++ all77" -for opt in $link_opts -do - output_subdir=$output_dir/link$opt - mkdir -p $output_subdir - ./test.sh link.sh $src_dir $opt - mv -f link.??? $output_subdir -done - -# Check for 'int', 'double', and 'MPI_' -./test.sh check-int.sh $src_dir -mv -f check-int.??? $output_dir -./test.sh check-double.sh $src_dir -mv -f check-double.??? $output_dir -./test.sh check-mpi.sh $src_dir -mv -f check-mpi.??? $output_dir - # Echo to stderr all nonempty error files in $output_dir for errfile in $( find $output_dir ! -size 0 -name "*.err" ) do diff -Nru hypre-2.11.2/AUTOTEST/machine-vulcan.sh hypre-2.13.0/AUTOTEST/machine-vulcan.sh --- hypre-2.11.2/AUTOTEST/machine-vulcan.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/machine-vulcan.sh 2017-10-20 17:42:22.000000000 +0000 @@ -47,16 +47,16 @@ ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED" co="" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -renametest.sh basictest $output_dir/basictest-default +./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro +./renametest.sh basic $output_dir/basic-default co="--enable-debug" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--enable-debug +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-debug co="--enable-bigint" -test.sh basictest.sh $src_dir -co: $co -mo: $mo -renametest.sh basictest $output_dir/basictest--enable-bigint +./test.sh basic.sh $src_dir -co: $co -mo: $mo +./renametest.sh basic $output_dir/basic--enable-bigint # Test linking for different languages link_opts="all++ all77" diff -Nru hypre-2.11.2/AUTOTEST/README.txt hypre-2.13.0/AUTOTEST/README.txt --- hypre-2.11.2/AUTOTEST/README.txt 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/README.txt 2017-10-20 17:42:22.000000000 +0000 @@ -1,12 +1,10 @@ This directory contains scripts for running various tests on the hypre library. -They are run automatically as part of hypre's regression testing, and they are -run manually to test new distributions of hypre before releasing them to the -public. The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'. - -Every test in this directory may be run manually by developers without fear of -interfering with the auto-testing, as long as they are not run from within the -auto-testing directory (currently '/usr/casc/hypre/testing'). +The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'. + +Every test in this directory may be run manually by developers. Many of the +scripts are also run as part of the nightly regression testing, currently +developed and maintained in a separate git repository called 'hypre/autotest'. ===================== @@ -17,20 +15,13 @@ represents an individual test written by a hypre developer. The special scripts are as follows (note that they are the only scripts with "test" in their names): -1. 'test.sh' - Used to run individual tests locally on a machine. -2. 'testsrc.sh' - Used to run individual tests on a remote machine. -3. 'testdist.sh' - Used to test a new distribution before release. -4. 'autotest.sh' - Usually run in an automatic fashion by 'cron', but may also - be run manually by developers (useful for debugging). +1. 'test.sh' - Used to run individual tests. +2. 'cleantest.sh' - Used to clean up the output from a test (or tests). +3. 'renametest.sh' - Used to rename the output from a test. Usage information for every script (special or individual test) can be obtained by running it with the '-h' option (e.g., 'test.sh -h' or 'make.sh -h'). -The file 'cronfile' encapsulates the current 'cron' entries for auto-testing. -It is possible (and probable) to have multiple developers running 'cron' jobs as -part of the overall auto-testing. This needs to be coordinated if the output -files are being written to the global auto-testing directory. - ===================== Writing tests: @@ -43,8 +34,7 @@ template and make the appropriate modifications. Try not to use the word "test" in the name of the script so that we can keep the convention of only the special scripts having this in their names. Try not to use absolute directory paths in -the script. If in doubt, talk to another developer or send an inquiry to -hypre-support@llnl.gov. +the script. ===================== @@ -52,7 +42,6 @@ - Minimal limitations on the types of tests that are possible. - Developers should be able to run the tests manually. -- Tests should be runable on both the repository and each release. - Minimal dependence on operating system and software tools (for portability). - Developers should be able to easily add new tests. - Simplicity and flexibility. diff -Nru hypre-2.11.2/AUTOTEST/testdist.sh hypre-2.13.0/AUTOTEST/testdist.sh --- hypre-2.11.2/AUTOTEST/testdist.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/testdist.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,174 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -# Which tests to run? -TEST_PATCH="-tux339" -TEST_MINOR="$TEST_PATCH -rzzeus -rzmerl -vulcan" -TEST_MAJOR="$TEST_MINOR" -TERMCMD="" - -while [ "$*" ] -do - case $1 in - -h|-help) - cat </dev/null 1>&2) then - rm -rf $release_dir $output_dir $autotest_dir/autotest-* - tar -zxf $release_file -fi -rm -rf $tmpdir -echo "" -echo "The following tests are needed to verify this $NAME release: $TESTS" -echo "" - -# List the status of the required tests -cd $autotest_dir -NOTRUN="" -FAILED="" -PENDING="" -for test in $TESTS -do - name=`echo $test | sed 's/[0-9]//g'` - # Determine failed, pending, passed and tests that have not been run - if [ -f $output_dir/machine$name.err ]; then - if [ -s $output_dir/machine$name.err ]; then - status="[FAILED] "; FAILED="$FAILED $test" - else - status="[PASSED] "; - fi - elif [ ! -e autotest$name-start ]; then - status="[NOT RUN]"; NOTRUN="$NOTRUN $test" - elif [ ! -e autotest$name-done ]; then - status="[PENDING]"; PENDING="$PENDING $test" - else - status="[UNKNOWN]"; - fi - if [ "$TERMCMD" == "" ]; then - echo "$status ./autotest.sh -dist $release $test" - else - echo "$status $TERMCMD ./autotest.sh -dist $release $test &" - fi -done - -# If all tests have been run, create a tarball of the log files -if [ "$NOTRUN$PENDING" == "" ]; then - echo ""; echo "Generating the verification file AUTOTEST-hypre-$release.tgz" - cd $testing_dir - mv -f $autotest_dir/autotest-* $output_dir - tar -zcf $autotest_dir/AUTOTEST-hypre-$release.tgz `basename $output_dir` -fi - -# If all tests have passed, print a message and exit -if [ "$NOTRUN$FAILED$PENDING" == "" ]; then - echo "The release is verified!" - exit -fi - -cat <> autotest$name.err - else - echo "Running test [$TERMCMD ./autotest.sh -dist $release $test &]" - $TERMCMD "./autotest.sh -dist $release $test 2>> autotest$name.err" 2>> autotest$name.err & - fi - echo "" - done -fi -echo "" -echo "Re-run the script after tests have completed to verify the release." -echo "" diff -Nru hypre-2.11.2/AUTOTEST/testsrc.sh hypre-2.13.0/AUTOTEST/testsrc.sh --- hypre-2.11.2/AUTOTEST/testsrc.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/AUTOTEST/testsrc.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,89 +0,0 @@ -#!/bin/sh -#BHEADER********************************************************************** -# Copyright (c) 2008, Lawrence Livermore National Security, LLC. -# Produced at the Lawrence Livermore National Laboratory. -# This file is part of HYPRE. See file COPYRIGHT for details. -# -# HYPRE is free software; you can redistribute it and/or modify it under the -# terms of the GNU Lesser General Public License (as published by the Free -# Software Foundation) version 2.1 dated February 1999. -# -# $Revision$ -#EHEADER********************************************************************** - -while [ "$*" ] -do - case $1 in - -h|-help) - cat < Tue, 12 Dec 2017 03:47:42 +0800 + +hypre (2.13.0-1exp1) experimental; urgency=medium + + * Team upload. + * New upstream version. + - new multigrid reduction (MGR) solver + - updates AMG Hybrid solver with BoomerAMG + - now uses external SuperLU (remove debian superlu_internal.patch) + and SuperLU-Dist. + Build-Depends: libsuperlu-dev, libsuperlu-dist-dev + - regression: breaks build of distributed_ls/pilut. Fixed with + debian patch pilut_blas_mangle.patch to use external BLAS. + * libhypre-dev depends on optimised OpenBLAS or ATLAS to provide + blas and lapack, where available. + + -- Drew Parsons Fri, 24 Nov 2017 13:52:43 +0800 + +hypre (2.12.1-1exp1) experimental; urgency=medium + + * Team upload. + * New upstream version. + - adds GPU support (not yet activated in Debian) + - adds support for single and quad precision floating point numbers + + -- Drew Parsons Wed, 18 Oct 2017 19:17:17 +0800 + hypre (2.11.2-2) unstable; urgency=medium * Team upload. diff -Nru hypre-2.11.2/debian/control hypre-2.13.0/debian/control --- hypre-2.11.2/debian/control 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/control 2017-12-11 19:47:42.000000000 +0000 @@ -3,14 +3,15 @@ Priority: optional Maintainer: Debian Science Maintainers Uploaders: "Adam C. Powell, IV" -Standards-Version: 4.1.1 +Standards-Version: 4.1.2 Build-Depends: autoconf, automake, debhelper (>= 10), gfortran, - libblas-dev | libblas.so, - liblapack-dev | liblapack.so, + libblas-dev | libopenblas-dev | libatlas-base-dev | libblas.so, + liblapack-dev | libopenblas-dev | libatlas-base-dev | liblapack.so, + libsuperlu-dev, libsuperlu-dist-dev, libltdl-dev, libtool, mpi-default-dev, @@ -18,7 +19,7 @@ Vcs-Browser: https://anonscm.debian.org/git/debian-science/packages/hypre.git Homepage: http://www.llnl.gov/casc/hypre/ -Package: libhypre-2.11.2 +Package: libhypre-2.13.0 Architecture: any Multi-Arch: same Section: libs @@ -36,9 +37,9 @@ Architecture: any Section: libdevel Depends: - libblas-dev | libblas.so, - libhypre-2.11.2 (= ${binary:Version}), - liblapack-dev | liblapack.so, + libhypre-2.13.0 (= ${binary:Version}), + libopenblas-dev | libatlas-base-dev | libblas-dev | libblas.so, + libopenblas-dev | libatlas-base-dev | liblapack-dev | liblapack.so, libsuperlu-dev, mpi-default-dev, ${misc:Depends}, diff -Nru hypre-2.11.2/debian/patches/cmake-install.patch hypre-2.13.0/debian/patches/cmake-install.patch --- hypre-2.11.2/debian/patches/cmake-install.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/cmake-install.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -Index: hypre-2.10.0b/src/CMakeLists.txt -=================================================================== ---- hypre-2.10.0b.orig/src/CMakeLists.txt -+++ hypre-2.10.0b/src/CMakeLists.txt -@@ -8,6 +8,10 @@ set (HYPRE_TIME 00:00:00) - set (HYPRE_BUGS hypre-support@llnl.gov) - set (HYPRE_SRCDIR "${PROJECT_SOURCE_DIR}") - -+set(HYPRE_MAJOR_VERSION 2) -+set(HYPRE_MINOR_VERSION 10) -+set(HYPRE_PATCH_VERSION 0b) -+ - if (${hypre_SOURCE_DIR} STREQUAL ${hypre_BINARY_DIR}) - message(FATAL_ERROR "In-place build not allowed! Please use a separate build directory. See the Users Manual or INSTALL file for details.") - endif () -@@ -778,9 +782,13 @@ if (HYPRE_USING_FEI) - endif () - - add_library (HYPRE ${HYPRE_SOURCES} ${FEI_LIBS}) -- --install (TARGETS HYPRE DESTINATION lib) --install (FILES ${HYPRE_HEADERS} DESTINATION include) -+if (BUILD_SHARED_LIBS) -+set_target_properties(HYPRE PROPERTIES -+ VERSION ${HYPRE_MAJOR_VERSION}.${HYPRE_MINOR_VERSION}.${HYPRE_PATCH_VERSION} -+ SOVERSION ${HYPRE_MAJOR_VERSION}) -+endif (BUILD_SHARED_LIBS) -+install (TARGETS HYPRE DESTINATION ${INSTALL_LIB_DIR}) -+install (FILES ${HYPRE_HEADERS} DESTINATION include/hypre) - - # add_subdirectory (test EXCLUDE_FROM_ALL) - diff -Nru hypre-2.11.2/debian/patches/cmake.patch hypre-2.13.0/debian/patches/cmake.patch --- hypre-2.11.2/debian/patches/cmake.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/cmake.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ -Index: hypre-2.9.1a/src/CMakeLists.txt -=================================================================== ---- hypre-2.9.1a.orig/src/CMakeLists.txt 2013-09-12 08:36:16.000000000 +0200 -+++ hypre-2.9.1a/src/CMakeLists.txt 2013-09-12 08:36:16.000000000 +0200 -@@ -772,7 +772,7 @@ - add_library (HYPRE ${HYPRE_SOURCES} ${FEI_LIBS}) - - install (TARGETS HYPRE DESTINATION lib) --install (FILES ${HYPRE_HEADERS} DESTINATION include) -+install (FILES ${HYPRE_HEADERS} DESTINATION include/hypre) - - # add_subdirectory (test EXCLUDE_FROM_ALL) - -Index: hypre-2.9.1a/src/FEI_mv/SuperLU/CMakeLists.txt -=================================================================== ---- hypre-2.9.1a.orig/src/FEI_mv/SuperLU/CMakeLists.txt 2012-09-25 00:59:53.000000000 +0200 -+++ hypre-2.9.1a/src/FEI_mv/SuperLU/CMakeLists.txt 2013-09-12 08:40:05.000000000 +0200 -@@ -59,4 +59,5 @@ - set_target_properties(HYPRE_superlu PROPERTIES COMPILE_FLAGS "-fPIC") - endif() - --install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include) -+install (TARGETS HYPRE_superlu DESTINATION lib) -+install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include/hypre) -Index: hypre-2.9.1a/src/FEI_mv/fei-hypre/CMakeLists.txt -=================================================================== ---- hypre-2.9.1a.orig/src/FEI_mv/fei-hypre/CMakeLists.txt 2012-09-25 02:13:39.000000000 +0200 -+++ hypre-2.9.1a/src/FEI_mv/fei-hypre/CMakeLists.txt 2013-09-12 08:40:41.000000000 +0200 -@@ -66,4 +66,5 @@ - set_target_properties(HYPRE_fei PROPERTIES COMPILE_FLAGS "-fPIC") - endif() - --install (FILES ${HYPRE_fei_HEADERS} DESTINATION include) -+install (TARGETS HYPRE_fei DESTINATION lib) -+install (FILES ${HYPRE_fei_HEADERS} DESTINATION include/hypre) -Index: hypre-2.9.1a/src/FEI_mv/femli/CMakeLists.txt -=================================================================== ---- hypre-2.9.1a.orig/src/FEI_mv/femli/CMakeLists.txt 2012-09-25 00:59:54.000000000 +0200 -+++ hypre-2.9.1a/src/FEI_mv/femli/CMakeLists.txt 2013-09-12 08:49:00.000000000 +0200 -@@ -46,3 +46,5 @@ - if(BUILD_SHARED_LIBS) - set_target_properties(HYPRE_mli PROPERTIES COMPILE_FLAGS "-fPIC") - endif() -+ -+install (TARGETS HYPRE_mli DESTINATION lib) diff -Nru hypre-2.11.2/debian/patches/complex.patch hypre-2.13.0/debian/patches/complex.patch --- hypre-2.11.2/debian/patches/complex.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/complex.patch 2017-12-11 19:47:42.000000000 +0000 @@ -27,7 +27,7 @@ =================================================================== --- hypre.orig/src/config/configure.in +++ hypre/src/config/configure.in -@@ -1259,6 +1259,12 @@ HYPRE_INSTALLDIR="${prefix}" +@@ -1698,6 +1698,12 @@ HYPRE_INSTALLDIR="${prefix}" HYPRE_LIBINSTALL="${libdir}" HYPRE_INCINSTALL="${includedir}/hypre" diff -Nru hypre-2.11.2/debian/patches/config-for-petsc.patch hypre-2.13.0/debian/patches/config-for-petsc.patch --- hypre-2.11.2/debian/patches/config-for-petsc.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/config-for-petsc.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -Index: hypre-2.10.0b/src/lib/Makefile -=================================================================== ---- hypre-2.10.0b.orig/src/lib/Makefile -+++ hypre-2.10.0b/src/lib/Makefile -@@ -53,9 +53,7 @@ LIBS_HYPRE = ../utilities/libHYPRE_utili - ../distributed_ls/Euclid/libHYPRE_Euclid.so \ - ../distributed_ls/ParaSails/libHYPRE_ParaSails.so \ - ../distributed_ls/pilut/libHYPRE_DistributedMatrixPilutSolver.so \ -- ../parcsr_ls/libHYPRE_parcsr_ls.so \ -- ../FEI_mv/femli/libHYPRE_mli.so \ -- ../FEI_mv/fei-hypre/libHYPRE_FEI.so -+ ../parcsr_ls/libHYPRE_parcsr_ls.so - - SONAME = libHYPRE-${HYPRE_RELEASE_VERSION}.so - SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS}\ diff -Nru hypre-2.11.2/debian/patches/config-update-dir.patch hypre-2.13.0/debian/patches/config-update-dir.patch --- hypre-2.11.2/debian/patches/config-update-dir.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/config-update-dir.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,19 +0,0 @@ -Change include paths so "aclocal -I ." works. - -Index: hypre/src/config/configure.in -=================================================================== ---- hypre.orig/src/config/configure.in -+++ hypre/src/config/configure.in -@@ -61,9 +61,9 @@ - m4_define([M4_HYPRE_BUGS], [hypre-support@llnl.gov]) - m4_define([M4_HYPRE_SRCDIR], [`pwd`]) - --m4_include([config/hypre_blas_macros.m4]) --m4_include([config/hypre_lapack_macros.m4]) --m4_include([config/hypre_macros_misc.m4]) -+m4_include([hypre_blas_macros.m4]) -+m4_include([hypre_lapack_macros.m4]) -+m4_include([hypre_macros_misc.m4]) - - AC_PREREQ(2.59) - AC_REVISION($Id: configure.in,v 1.101 2011/11/14 22:36:27 falgout Exp $) diff -Nru hypre-2.11.2/debian/patches/configure.patch hypre-2.13.0/debian/patches/configure.patch --- hypre-2.11.2/debian/patches/configure.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/configure.patch 2017-12-11 19:47:42.000000000 +0000 @@ -11,7 +11,7 @@ dnl ********************************************************************* dnl * Initialize some variables -@@ -161,19 +161,18 @@ hypre_lapack_lib_dir_old_style=no +@@ -169,19 +169,18 @@ hypre_lapack_lib_dir_old_style=no dnl ********************************************************************* dnl * Determine BUILD, HOST, and TARGET types dnl ********************************************************************* @@ -36,7 +36,7 @@ fi dnl ********************************************************************* -@@ -1258,7 +1257,7 @@ dnl * Set installation directories +@@ -1697,7 +1696,7 @@ dnl * Set installation directories dnl ********************************************************************* HYPRE_INSTALLDIR="${prefix}" HYPRE_LIBINSTALL="${libdir}" diff -Nru hypre-2.11.2/debian/patches/examples.patch hypre-2.13.0/debian/patches/examples.patch --- hypre-2.11.2/debian/patches/examples.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/examples.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,257 +0,0 @@ -Index: hypre-2.10.0b/src/examples/Makefile -=================================================================== ---- hypre-2.10.0b.orig/src/examples/Makefile -+++ hypre-2.10.0b/src/examples/Makefile -@@ -11,7 +11,7 @@ HYPRE_DIR = ../hypre - # Compiling and linking options - ######################################################################## - COPTS = -g -Wall --CINCLUDES = -I$(HYPRE_DIR)/include -+CINCLUDES = -I${HYPRE_INC_INSTALL} - CDEFS = -DHAVE_CONFIG_H -DHYPRE_TIMING - CFLAGS = $(COPTS) $(CINCLUDES) $(CDEFS) - FOPTS = -g -@@ -27,7 +27,27 @@ F90FLAGS = $(FFLAGS) $(IF90FLAGS) - - - LINKOPTS = $(COPTS) --LIBS = -L$(HYPRE_DIR)/lib -lHYPRE -lm -+LIBS = \ -+ -L${HYPRE_LIB_INSTALL} \ -+ -lHYPRE_utilities \ -+ -lHYPRE_multivector \ -+ -lHYPRE_krylov \ -+ -lHYPRE_struct_mv \ -+ -lHYPRE_struct_ls \ -+ -lHYPRE_sstruct_mv \ -+ -lHYPRE_sstruct_ls \ -+ -lHYPRE_seq_mv \ -+ -lHYPRE_parcsr_mv \ -+ -lHYPRE_parcsr_ls \ -+ -lHYPRE_parcsr_block_mv \ -+ -lHYPRE_DistributedMatrix \ -+ -lHYPRE_MatrixMatrix \ -+ -lHYPRE_IJ_mv \ -+ -lHYPRE_Euclid \ -+ -lHYPRE_ParaSails \ -+ -lHYPRE_DistributedMatrixPilutSolver \ -+ -lHYPRE \ -+ -lblas -llapack -lm - LFLAGS = $(LINKOPTS) $(LIBS) -lstdc++ - LFLAGS_B =\ - -L${HYPRE_DIR}/lib\ -@@ -37,7 +57,7 @@ LFLAGS_B =\ - -lbHYPRE\ - -lsidl -ldl -lxml2 - LFLAGS77 = $(LFLAGS) --LFLAGS90 = -+LFLAGS90 = $(LFLAGS) - - ######################################################################## - # Rules for compiling the source files -@@ -47,7 +67,7 @@ LFLAGS90 = - .c.o: - $(CC) $(CFLAGS) -c $< - .f.o: -- $(F77) $(FFLAGS) -c $< -+ $(F90) $(FFLAGS) -c $< - .cxx.o: - $(CXX) $(CXXFLAGS) -c $< - -@@ -65,9 +85,10 @@ LFLAGS90 = - ######################################################################## - # List of all programs to be compiled - ######################################################################## --ALLPROGS = ex1 ex2 ex3 ex4 ex5 ex5f ex6 ex7 ex8 ex9 ex10 ex11 ex12 ex12f \ -+ALLPROGS = ex1 ex2 ex3 ex4 ex5 ex5f ex6 ex7 ex8 ex9 ex11 ex12 ex12f \ - ex13 ex14 ex15 ex16 - BIGINTPROGS = ex5big ex15big -+FEIPROGS= ex10 - BABELPROGS = ex5b ex5b77 ex5bxx ex6b ex6b77 - # ... ex5bp and ex5b90 are not in BABELPROGS because they require a - # software environment which many people haven't set up. -@@ -89,59 +110,61 @@ maxdim: $(MAXDIMPROGS) - - complex: $(COMPLEXPROGS) - -+fei: $(FEIPROGS) -+ - ######################################################################## - # Example 1 - ######################################################################## - ex1: ex1.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 2 - ######################################################################## - ex2: ex2.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 3 - ######################################################################## - ex3: ex3.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 4 - ######################################################################## - ex4: ex4.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 5 - ######################################################################## - ex5: ex5.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 5 with 64-bit integers - ######################################################################## - ex5big: ex5big.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 5 Fortran 77 - ######################################################################## - ex5f: ex5f.o -- $(F77) -o $@ $^ $(LFLAGS77) -+ $(F90) -o $@ $^ $(LFLAGS90) - - ######################################################################## - # Example 5 Babel C - ######################################################################## - ex5b: ex5b.o -- $(CC) -o $@ $^ $(LFLAGS_B) $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS) - - ######################################################################## - # Example 5 Babel Fortran 77 - ######################################################################## - ex5b77: ex5b77.o -- $(F77) -o $@ $^ $(LFLAGS_B) $(LFLAGS) -+ $(F90) -o $@ $^ $(LFLAGS_B) $(LFLAGS90) - - ######################################################################## - # Example 5 Babel Fortran 90 -@@ -153,97 +176,97 @@ ex5b90: ex5b90.o - # Example 5 Babel C++ - ######################################################################## - ex5bxx: ex5bxx.o -- $(CXX) -o $@ $^ $(LFLAGS_B) $(LFLAGS) -+ $(CXX) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS) - - ######################################################################## - # Example 6 - ######################################################################## - ex6: ex6.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 6 Babel C - ######################################################################## - ex6b: ex6b.o -- $(CC) -o $@ $^ $(LFLAGS_B) $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS) - - ######################################################################## - # Example 6 Babel Fortran 77 - ######################################################################## - ex6b77: ex6b77.o -- $(F77) -o $@ $^ $(LFLAGS_B) $(LFLAGS) -+ $(F90) -o $@ $^ $(LFLAGS_B) $(LFLAGS90) - - ######################################################################## - # Example 7 - ######################################################################## - ex7: ex7.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 8 - ######################################################################## - ex8: ex8.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 9 - ######################################################################## - ex9: ex9.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 10 - ######################################################################## - ex10: ex10.o -- $(CXX) -o $@ $^ $(LFLAGS) -+ $(CXX) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 11 - ######################################################################## - ex11: ex11.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 12 - ######################################################################## - ex12: ex12.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 12 Fortran 77 - ######################################################################## - ex12f: ex12f.o -- $(F77) -o $@ $^ $(LFLAGS77) -+ $(F90) -o $@ $^ $(LFLAGS) $(LFLAGS90) - - ######################################################################## - # Example 13 - ######################################################################## - ex13: ex13.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 14 - ######################################################################## - ex14: ex14.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 15 - ######################################################################## - ex15: ex15.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 15 with 64-bit integers - ######################################################################## - ex15big: ex15big.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 16 - ######################################################################## - ex16: ex16.o -- $(CC) -o $@ $^ $(LFLAGS) -+ $(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS) - - ######################################################################## - # Example 17 diff -Nru hypre-2.11.2/debian/patches/fgmres-lib.patch hypre-2.13.0/debian/patches/fgmres-lib.patch --- hypre-2.11.2/debian/patches/fgmres-lib.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/fgmres-lib.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,183 +0,0 @@ -Add libHYPRE_FEI_fgmres.[a|so] to resolve a circular dependency: -libHYPRE_FEI with fgmres.c depends on libHYPRE_mli.so which links with -symbols in fgmres.c . (Resolves issue 182.) - -Index: hypre-2.9.1a/src/config/configure.in -=================================================================== ---- hypre-2.9.1a.orig/src/config/configure.in 2013-09-12 17:33:00.000000000 +0200 -+++ hypre-2.9.1a/src/config/configure.in 2013-09-12 17:33:00.000000000 +0200 -@@ -914,7 +914,7 @@ - if test "$hypre_using_fei" = "yes" - then - HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv" -- HYPRE_FEI_SUBDIRS="femli fei-hypre" -+ HYPRE_FEI_SUBDIRS="fgmres femli fei-hypre" - HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o" - HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o" - if test "$hypre_using_superlu" = "yes" -Index: hypre-2.9.1a/src/FEI_mv/fei-hypre/Makefile -=================================================================== ---- hypre-2.9.1a.orig/src/FEI_mv/fei-hypre/Makefile 2013-09-12 17:33:00.000000000 +0200 -+++ hypre-2.9.1a/src/FEI_mv/fei-hypre/Makefile 2013-09-12 17:33:00.000000000 +0200 -@@ -113,7 +113,6 @@ - HYPRE_parcsr_TFQmr.c\ - HYPRE_parcsr_bicgs.c\ - HYPRE_parcsr_bicgstabl.c\ -- HYPRE_parcsr_fgmres.c\ - HYPRE_parcsr_lsicg.c\ - HYPRE_parcsr_symqmr.c\ - HYPRE_parcsr_maxwell.c\ -@@ -121,11 +120,12 @@ - TFQmr.c\ - bicgs.c\ - bicgstabl.c\ -- fgmres.c\ - hypre_lsi_amge.c\ - hypre_lsi_ddamg.c\ - hypre_lsi_misc.c\ - lsicg.c -+# HYPRE_parcsr_fgmres.c -+# fgmres.c - - FILESCXX = \ - FEI_HYPRE_Impl.cxx\ -Index: hypre-2.9.1a/src/FEI_mv/fgmres/Makefile -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ hypre-2.9.1a/src/FEI_mv/fgmres/Makefile 2013-09-12 17:33:00.000000000 +0200 -@@ -0,0 +1,135 @@ -+#BHEADER********************************************************************** -+# Copyright (c) 2006 The Regents of the University of California. -+# Produced at the Lawrence Livermore National Laboratory. -+# Written by the HYPRE team. UCRL-CODE-222953. -+# All rights reserved. -+# -+# This file is part of HYPRE (see http://www.llnl.gov/CASC/hypre/). -+# Please see the COPYRIGHT_and_LICENSE file for the copyright notice, -+# disclaimer, contact information and the GNU Lesser General Public License. -+# -+# HYPRE is free software; you can redistribute it and/or modify it under the -+# terms of the GNU General Public License (as published by the Free Software -+# Foundation) version 2.1 dated February 1999. -+# -+# HYPRE is distributed in the hope that it will be useful, but WITHOUT ANY -+# WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS -+# FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General -+# Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public License -+# along with this program; if not, write to the Free Software Foundation, -+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+# -+# $Revision: 2.17 $ -+#EHEADER********************************************************************** -+ -+# why do we create this variable, asked by Charles -+HYPRE_LIB_SUFFIX = .a -+ -+include ../../config/Makefile.config -+ -+BASE_DIR = ${HYPRE_FEI_BASE_DIR} -+ -+SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU -+SUPERLU_LIB = -L$(srcdir)/../SuperLU -+ -+CINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include -+CXXINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include -+CDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -+CXXDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DMPICH_SKIP_MPICXX -+ -+C_COMPILE_FLAGS = \ -+ ${CDEFS}\ -+ -I../..\ -+ -I$(BASE_DIR)\ -+ -I$(srcdir)\ -+ -I$(srcdir)/../..\ -+ -I$(srcdir)/../../IJ_mv\ -+ -I$(srcdir)/../../utilities\ -+ -I$(srcdir)/../../multivector\ -+ -I$(srcdir)/../../krylov\ -+ -I$(srcdir)/../../parcsr_mv\ -+ -I$(srcdir)/../../parcsr_ls\ -+ -I$(srcdir)/../../seq_mv\ -+ -I$(srcdir)/../../distributed_matrix\ -+ -I$(srcdir)/../../distributed_ls\ -+ -I$(srcdir)/../fei-base\ -+ ${SUPERLU_INCLUDE}\ -+ ${CINCLUDES} -+ -+CXX_COMPILE_FLAGS = \ -+ ${CXXDEFS}\ -+ -I../..\ -+ -I$(BASE_DIR)\ -+ -I$(srcdir)/../..\ -+ -I$(srcdir)/../../IJ_mv\ -+ -I$(srcdir)/../../utilities\ -+ -I$(srcdir)/../../multivector\ -+ -I$(srcdir)/../../krylov\ -+ -I$(srcdir)/../../parcsr_mv\ -+ -I$(srcdir)/../../parcsr_ls\ -+ -I$(srcdir)/../../seq_mv\ -+ -I$(srcdir)/../../distributed_matrix\ -+ -I$(srcdir)/../../distributed_ls\ -+ -I$(srcdir)/../femli\ -+ -I$(srcdir)/../fei-base\ -+ ${SUPERLU_INCLUDE}\ -+ ${CXXINCLUDES} -+ -+HYPRE_LIBS = \ -+ ../../utilities/libHYPRE_utilities.so \ -+ ../../parcsr_ls/libHYPRE_parcsr_ls.so -+# ../../krylov/libHYPRE_krylov.so -+# ../../seq_mv/libHYPRE_seq_mv.so -+# ../../parcsr_mv/libHYPRE_parcsr_mv.so -+# ../../IJ_mv/libHYPRE_IJ_mv.so -+ -+SYSTEM_LIBS = -lm -+ -+HEADERS = -+ -+FILESC = \ -+ ../fei-hypre/HYPRE_parcsr_fgmres.c\ -+ ../fei-hypre/fgmres.c -+ -+FILESCXX = -+ -+OBJSC = ${FILESC:.c=.o} -+OBJSCXX = ${FILESCXX:.cxx=.o} -+OBJS = ${OBJSC} ${OBJSCXX} -+ -+################################################################## -+# Targets -+################################################################## -+ -+all: libHYPRE_FEI_fgmres${HYPRE_LIB_SUFFIX} -+ cp -fpd libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ -+install: libHYPRE_FEI_fgmres${HYPRE_LIB_SUFFIX} -+ cp -fpd libHYPRE* $(HYPRE_LIB_INSTALL) -+ @echo "" -+ -+clean: -+ rm -rf *.o libHYPRE* -+ rm -rf pchdir tca.map *inslog* -+ -+distclean: clean -+ -+################################################################## -+# Rules -+################################################################## -+libHYPRE_FEI_fgmres.a: ${OBJS} -+ @echo "Building $@ ... " -+ mv -f *.o ../fei-hypre/ -+ ${AR} $@ ${OBJS} -+ ${RANLIB} $@ -+ -+libHYPRE_FEI_fgmres.so: ${OBJS} ${HYPRE_LIBS} -+ @echo "Building $@ ... " -+ mv -f *.o ../fei-hypre/ -+ ${BUILD_CC_SHARED} $^ ${SYSTEM_LIBS} -o libHYPRE_FEI_fgmres-2.8.0b.so -Wl,-soname,libHYPRE_FEI_fgmres-2.8.0b.so \ -+ -Wl,-z,defs -+ ln -s libHYPRE_FEI_fgmres-2.8.0b.so $@ -+ -+${OBJS}: ${HEADERS} diff -Nru hypre-2.11.2/debian/patches/flexgmres-to-parcsr_ls.patch hypre-2.13.0/debian/patches/flexgmres-to-parcsr_ls.patch --- hypre-2.11.2/debian/patches/flexgmres-to-parcsr_ls.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/flexgmres-to-parcsr_ls.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -Move (HYPRE_)flexgmres.c from krylov to parcsr_ls to resolve another -circular dependency between those two libraries. - -Index: hypre-2.9.1a/src/krylov/Makefile -=================================================================== ---- hypre-2.9.1a.orig/src/krylov/Makefile 2013-09-12 17:32:50.000000000 +0200 -+++ hypre-2.9.1a/src/krylov/Makefile 2013-09-12 17:32:50.000000000 +0200 -@@ -39,17 +39,17 @@ - bicgstab.c\ - cgnr.c\ - gmres.c\ -- flexgmres.c\ - lgmres.c\ - HYPRE_bicgstab.c\ - HYPRE_cgnr.c\ - HYPRE_gmres.c\ - HYPRE_lgmres.c\ -- HYPRE_flexgmres.c\ - HYPRE_pcg.c\ - pcg.c\ - HYPRE_lobpcg.c\ - lobpcg.c -+# flexgmres.c -+# HYPRE_flexgmres.c - - OBJS = ${FILES:.c=.o} - -Index: hypre-2.9.1a/src/parcsr_ls/Makefile -=================================================================== ---- hypre-2.9.1a.orig/src/parcsr_ls/Makefile 2013-09-12 17:32:50.000000000 +0200 -+++ hypre-2.9.1a/src/parcsr_ls/Makefile 2013-09-12 17:32:50.000000000 +0200 -@@ -120,7 +120,9 @@ - block_tridiag.c\ - ams.c\ - ads.c\ -- ame.c -+ ame.c\ -+ flexgmres.c\ -+ HYPRE_flexgmres.c - - OBJS = ${FILES:.c=.o} - diff -Nru hypre-2.11.2/debian/patches/install-libs.patch hypre-2.13.0/debian/patches/install-libs.patch --- hypre-2.11.2/debian/patches/install-libs.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/install-libs.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,379 +0,0 @@ -Remove comments so all libraries install. - -Index: hypre/src/FEI_mv/fei-hypre/Makefile -=================================================================== ---- hypre.orig/src/FEI_mv/fei-hypre/Makefile -+++ hypre/src/FEI_mv/fei-hypre/Makefile -@@ -182,7 +182,7 @@ - cp -fpPR $(BASE_DIR)/base/Lookup.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(BASE_DIR)/base/LinearSystemCore.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(BASE_DIR)/fei_defs.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_FEI${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/cfei-hypre.h $(HYPRE_INC_INSTALL) -@@ -195,7 +195,7 @@ - cp -fpPR $(BASE_DIR)/base/Lookup.h $(HYPRE_INC_INSTALL) - cp -fpPR $(BASE_DIR)/base/LinearSystemCore.h $(HYPRE_INC_INSTALL) - cp -fpPR $(BASE_DIR)/fei_defs.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - @echo "" - - clean: -Index: hypre/src/FEI_mv/femli/lib/Makefile -=================================================================== ---- hypre.orig/src/FEI_mv/femli/lib/Makefile -+++ hypre/src/FEI_mv/femli/lib/Makefile -@@ -145,10 +145,10 @@ - ################################################################## - - all: libHYPRE_mli${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_mli${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - @echo " " - - clean: -Index: hypre/src/IJ_mv/Makefile -=================================================================== ---- hypre.orig/src/IJ_mv/Makefile -+++ hypre/src/IJ_mv/Makefile -@@ -69,12 +69,12 @@ - all: libHYPRE_IJ_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_IJ_mv.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_IJ_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_IJ_mv.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/distributed_ls/Euclid/Makefile -=================================================================== ---- hypre.orig/src/distributed_ls/Euclid/Makefile -+++ hypre/src/distributed_ls/Euclid/Makefile -@@ -105,10 +105,10 @@ - ################################################################## - - all: libHYPRE_Euclid${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_Euclid${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/distributed_ls/ParaSails/Makefile -=================================================================== ---- hypre.orig/src/distributed_ls/ParaSails/Makefile -+++ hypre/src/distributed_ls/ParaSails/Makefile -@@ -71,10 +71,10 @@ - ################################################################## - - all: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX} --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/distributed_ls/pilut/Makefile -=================================================================== ---- hypre.orig/src/distributed_ls/pilut/Makefile -+++ hypre/src/distributed_ls/pilut/Makefile -@@ -63,11 +63,11 @@ - - all: libHYPRE_DistributedMatrixPilutSolver${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_DistributedMatrixPilutSolver${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/distributed_matrix/Makefile -=================================================================== ---- hypre.orig/src/distributed_matrix/Makefile -+++ hypre/src/distributed_matrix/Makefile -@@ -50,11 +50,11 @@ - - all: libHYPRE_DistributedMatrix${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/distributed_matrix.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_DistributedMatrix${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/distributed_matrix.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/krylov/Makefile -=================================================================== ---- hypre.orig/src/krylov/Makefile -+++ hypre/src/krylov/Makefile -@@ -67,13 +67,13 @@ - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/krylov.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/lobpcg.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: all - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/krylov.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/lobpcg.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/matrix_matrix/Makefile -=================================================================== ---- hypre.orig/src/matrix_matrix/Makefile -+++ hypre/src/matrix_matrix/Makefile -@@ -49,11 +49,11 @@ - - all: libHYPRE_MatrixMatrix${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_MatrixMatrix${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o lib* -Index: hypre/src/multivector/Makefile -=================================================================== ---- hypre.orig/src/multivector/Makefile -+++ hypre/src/multivector/Makefile -@@ -39,11 +39,11 @@ - - all: libHYPRE_multivector${HYPRE_LIB_SUFFIX} - cp -fpPR *.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_multivector${HYPRE_LIB_SUFFIX} - cp -fpPR *.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/parcsr_block_mv/Makefile -=================================================================== ---- hypre.orig/src/parcsr_block_mv/Makefile -+++ hypre/src/parcsr_block_mv/Makefile -@@ -75,7 +75,7 @@ - all: libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/par_csr_block_matrix.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/csr_block_matrix.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE_* $(HYPRE_BUILD_DIR)/lib/. -+ cp -fpPR libHYPRE_* $(HYPRE_BUILD_DIR)/lib/. - - driver: driver.o libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX} - @echo "Linking" $@ "... " -@@ -104,7 +104,7 @@ - install: libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/par_csr_block_matrix.h $(HYPRE_INC_INSTALL)/. - cp -fpPR $(srcdir)/csr_block_matrix.h $(HYPRE_INC_INSTALL)/. --# cp -fpPR libHYPRE_* $(HYPRE_LIB_INSTALL)/. -+ cp -fpPR libHYPRE_* $(HYPRE_LIB_INSTALL)/. - - clean: - rm -rf *.o libHYPRE_* -Index: hypre/src/parcsr_ls/Makefile -=================================================================== ---- hypre.orig/src/parcsr_ls/Makefile -+++ hypre/src/parcsr_ls/Makefile -@@ -155,12 +155,12 @@ - all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/parcsr_mv/Makefile -=================================================================== ---- hypre.orig/src/parcsr_mv/Makefile -+++ hypre/src/parcsr_mv/Makefile -@@ -85,7 +85,7 @@ - all: libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_parcsr_mv.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - driver: driver.o libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX} - @echo "Linking" $@ "... " -@@ -114,7 +114,7 @@ - install: libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_parcsr_mv.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/seq_ls/pamg/Makefile -=================================================================== ---- hypre.orig/src/seq_ls/pamg/Makefile -+++ hypre/src/seq_ls/pamg/Makefile -@@ -90,8 +90,10 @@ - ################################################################## - - lib: libHYPRE_amg_ls${HYPRE_LIB_SUFFIX} -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - all: libHYPRE_amg_ls${HYPRE_LIB_SUFFIX} driver -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - install: all - -Index: hypre/src/seq_mv/Makefile -=================================================================== ---- hypre.orig/src/seq_mv/Makefile -+++ hypre/src/seq_mv/Makefile -@@ -55,12 +55,12 @@ - all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/seq_mv.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/seq_mv.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/sstruct_ls/Makefile -=================================================================== ---- hypre.orig/src/sstruct_ls/Makefile -+++ hypre/src/sstruct_ls/Makefile -@@ -133,12 +133,12 @@ - all: libHYPRE_sstruct_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_sstruct_ls.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_sstruct_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_sstruct_ls.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/sstruct_mv/Makefile -=================================================================== ---- hypre.orig/src/sstruct_mv/Makefile -+++ hypre/src/sstruct_mv/Makefile -@@ -78,12 +78,12 @@ - all: libHYPRE_sstruct_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_sstruct_mv.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_sstruct_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_sstruct_mv.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/struct_ls/Makefile -=================================================================== ---- hypre.orig/src/struct_ls/Makefile -+++ hypre/src/struct_ls/Makefile -@@ -119,12 +119,12 @@ - all: libHYPRE_struct_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_struct_ls.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_struct_ls${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_struct_ls.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/struct_mv/Makefile -=================================================================== ---- hypre.orig/src/struct_mv/Makefile -+++ hypre/src/struct_mv/Makefile -@@ -82,12 +82,12 @@ - all: libHYPRE_struct_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_struct_mv.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_struct_mv${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_struct_mv.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* -Index: hypre/src/utilities/Makefile -=================================================================== ---- hypre.orig/src/utilities/Makefile -+++ hypre/src/utilities/Makefile -@@ -73,13 +73,13 @@ - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/_hypre_utilities.h $(HYPRE_BUILD_DIR)/include - cp -fpPR $(srcdir)/fortran*.h $(HYPRE_BUILD_DIR)/include --# cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib -+ cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_utilities${HYPRE_LIB_SUFFIX} - cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/_hypre_utilities.h $(HYPRE_INC_INSTALL) - cp -fpPR $(srcdir)/fortran*.h $(HYPRE_INC_INSTALL) --# cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) -+ cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL) - - clean: - rm -f *.o libHYPRE* f2c.h *blas.h *lapack.h diff -Nru hypre-2.11.2/debian/patches/install.patch hypre-2.13.0/debian/patches/install.patch --- hypre-2.11.2/debian/patches/install.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/install.patch 2017-12-11 19:47:42.000000000 +0000 @@ -2,7 +2,7 @@ =================================================================== --- hypre.orig/src/FEI_mv/fei-hypre/Makefile +++ hypre/src/FEI_mv/fei-hypre/Makefile -@@ -182,7 +182,7 @@ all: libHYPRE_FEI${HYPRE_LIB_SUFFIX} +@@ -183,7 +183,7 @@ all: libHYPRE_FEI${HYPRE_LIB_SUFFIX} install: libHYPRE_FEI${HYPRE_LIB_SUFFIX} cp -fR $(srcdir)/cfei-hypre.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/cfei_hypre.h $(HYPRE_INC_INSTALL) @@ -11,7 +11,7 @@ cp -fR $(srcdir)/HYPRE_LinSysCore.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/HYPRE_FEI*.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/LLNL_FEI*.h $(HYPRE_INC_INSTALL) -@@ -193,6 +193,7 @@ install: libHYPRE_FEI${HYPRE_LIB_SUFFIX} +@@ -194,6 +194,7 @@ install: libHYPRE_FEI${HYPRE_LIB_SUFFIX} cp -fR $(BASE_DIR)/fei_bool.h $(HYPRE_BUILD_DIR)/include cp -fR $(BASE_DIR)/fei_defs.h $(HYPRE_BUILD_DIR)/include cp -fR $(BASE_DIR)/fei_mpi.h $(HYPRE_BUILD_DIR)/include @@ -23,7 +23,7 @@ =================================================================== --- hypre.orig/src/FEI_mv/femli/Makefile +++ hypre/src/FEI_mv/femli/Makefile -@@ -137,7 +137,7 @@ all: libHYPRE_mli${HYPRE_LIB_SUFFIX} +@@ -138,7 +138,7 @@ all: libHYPRE_mli${HYPRE_LIB_SUFFIX} # cp -fR libHYPRE* $(HYPRE_BUILD_DIR)/lib install: libHYPRE_mli${HYPRE_LIB_SUFFIX} @@ -62,7 +62,7 @@ =================================================================== --- hypre.orig/src/distributed_ls/ParaSails/Makefile +++ hypre/src/distributed_ls/ParaSails/Makefile -@@ -67,7 +67,7 @@ all: libHYPRE_ParaSails${HYPRE_LIB_SUFFI +@@ -69,7 +69,7 @@ all: libHYPRE_ParaSails${HYPRE_LIB_SUFFI # cp -fR libHYPRE* $(HYPRE_BUILD_DIR)/lib install: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX} @@ -101,7 +101,7 @@ =================================================================== --- hypre.orig/src/krylov/Makefile +++ hypre/src/krylov/Makefile -@@ -69,7 +69,7 @@ install: all +@@ -71,7 +71,7 @@ install: all cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/krylov.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/lobpcg.h $(HYPRE_INC_INSTALL) @@ -153,7 +153,7 @@ =================================================================== --- hypre.orig/src/parcsr_ls/Makefile +++ hypre/src/parcsr_ls/Makefile -@@ -140,7 +140,7 @@ all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFI +@@ -147,7 +147,7 @@ all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFI install: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX} cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_INC_INSTALL) @@ -179,7 +179,7 @@ =================================================================== --- hypre.orig/src/seq_mv/Makefile +++ hypre/src/seq_mv/Makefile -@@ -59,7 +59,7 @@ all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX} +@@ -63,7 +63,7 @@ all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX} install: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX} cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/seq_mv.h $(HYPRE_INC_INSTALL) @@ -244,7 +244,7 @@ =================================================================== --- hypre.orig/src/utilities/Makefile +++ hypre/src/utilities/Makefile -@@ -81,7 +81,7 @@ install: libHYPRE_utilities${HYPRE_LIB_S +@@ -83,7 +83,7 @@ install: libHYPRE_utilities${HYPRE_LIB_S cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/_hypre_utilities.h $(HYPRE_INC_INSTALL) cp -fR $(srcdir)/fortran*.h $(HYPRE_INC_INSTALL) @@ -257,7 +257,7 @@ =================================================================== --- hypre.orig/src/Makefile +++ hypre/src/Makefile -@@ -187,23 +187,16 @@ checkpar: +@@ -188,23 +188,16 @@ checkpar: install: all @ \ echo "Installing hypre ..."; \ diff -Nru hypre-2.11.2/debian/patches/lapack.patch hypre-2.13.0/debian/patches/lapack.patch --- hypre-2.11.2/debian/patches/lapack.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/lapack.patch 2017-12-11 19:47:42.000000000 +0000 @@ -2,7 +2,7 @@ =================================================================== --- hypre.orig/src/FEI_mv/femli/Makefile +++ hypre/src/FEI_mv/femli/Makefile -@@ -35,6 +35,7 @@ MLI_INCLUDES = \ +@@ -36,6 +36,7 @@ MLI_INCLUDES = \ C_COMPILE_FLAGS =\ -DMLI_SUPERLU\ diff -Nru hypre-2.11.2/debian/patches/pilut_blas_mangle.patch hypre-2.13.0/debian/patches/pilut_blas_mangle.patch --- hypre-2.11.2/debian/patches/pilut_blas_mangle.patch 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/debian/patches/pilut_blas_mangle.patch 2017-12-11 19:47:42.000000000 +0000 @@ -0,0 +1,12 @@ +Index: hypre/src/distributed_ls/pilut/macros.h +=================================================================== +--- hypre.orig/src/distributed_ls/pilut/macros.h ++++ hypre/src/distributed_ls/pilut/macros.h +@@ -18,6 +18,7 @@ + + #include "../../utilities/general.h" + #include "../../utilities/fortran.h" ++#include "../../blas/_hypre_blas.h" + + /* + * macros.h diff -Nru hypre-2.11.2/debian/patches/series hypre-2.13.0/debian/patches/series --- hypre-2.11.2/debian/patches/series 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/series 2017-12-11 19:47:42.000000000 +0000 @@ -1,12 +1,9 @@ +pilut_blas_mangle.patch configure.patch install.patch -#examples.patch -##cmake-install.patch -#config-for-petsc.patch blas.patch complex.patch shlibs-interlink-libHYPRE.so.patch test.patch lapack.patch -superlu_internal.patch shlibs-interlink-sublibs.patch diff -Nru hypre-2.11.2/debian/patches/shlibs-interlink-libHYPRE.so.patch hypre-2.13.0/debian/patches/shlibs-interlink-libHYPRE.so.patch --- hypre-2.11.2/debian/patches/shlibs-interlink-libHYPRE.so.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/shlibs-interlink-libHYPRE.so.patch 2017-12-11 19:47:42.000000000 +0000 @@ -12,7 +12,7 @@ =================================================================== --- hypre.orig/src/lib/Makefile +++ hypre/src/lib/Makefile -@@ -59,6 +59,28 @@ $(UTILITIESFILES)\ +@@ -57,6 +57,27 @@ $(UTILITIESFILES)\ $(BLASFILES)\ $(LAPACKFILES) @@ -35,13 +35,12 @@ + ../distributed_ls/pilut/libHYPRE_DistributedMatrixPilutSolver.so \ + ../parcsr_ls/libHYPRE_parcsr_ls.so \ + ../FEI_mv/fei-hypre/libHYPRE_FEI.so \ -+ ../FEI_mv/femli/libHYPRE_mli.so \ -+ ../FEI_mv/SuperLU/SRC/libHYPRE_superlu.so ++ ../FEI_mv/femli/libHYPRE_mli.so + SONAME = libHYPRE-${HYPRE_RELEASE_VERSION}.so SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS}\ ${BLASLIBDIRS} ${BLASLIBS} ${LIBS} ${FLIBS} -@@ -71,7 +93,7 @@ SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPA +@@ -69,7 +90,7 @@ SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPA all: libHYPRE${HYPRE_LIB_SUFFIX} cp -fR libHYPRE* ${HYPRE_BUILD_DIR}/lib @@ -50,7 +49,7 @@ cp -fR libHYPRE* ${HYPRE_LIB_INSTALL} clean: -@@ -105,7 +127,7 @@ libHYPRE.a: ${FILES_HYPRE} +@@ -103,7 +124,7 @@ libHYPRE.a: ${FILES_HYPRE} ${AR} $@ $(UTILITIESFILES) $(BLASFILES) $(LAPACKFILES) ${RANLIB} $@ diff -Nru hypre-2.11.2/debian/patches/shlibs-interlink-sublibs.patch hypre-2.13.0/debian/patches/shlibs-interlink-sublibs.patch --- hypre-2.11.2/debian/patches/shlibs-interlink-sublibs.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/shlibs-interlink-sublibs.patch 2017-12-11 19:47:42.000000000 +0000 @@ -2,13 +2,12 @@ =================================================================== --- hypre.orig/src/FEI_mv/fei-hypre/Makefile +++ hypre/src/FEI_mv/fei-hypre/Makefile -@@ -63,6 +63,19 @@ CXX_COMPILE_FLAGS = \ - ${SUPERLU_INCLUDE}\ +@@ -64,6 +64,18 @@ CXX_COMPILE_FLAGS = \ + ${DSUPERLU_INCLUDE}\ ${CXXINCLUDES} + +HYPRE_LIBS = \ -+ ../SuperLU/SRC/libHYPRE_superlu.so \ + ../femli/libHYPRE_mli.so \ + ../../utilities/libHYPRE_utilities.so \ + ../../seq_mv/libHYPRE_seq_mv.so \ @@ -22,7 +21,7 @@ HEADERS =\ cfei-hypre.h\ cfei_hypre.h\ -@@ -211,9 +224,9 @@ libHYPRE_FEI.a: ${OBJS} +@@ -212,9 +224,9 @@ libHYPRE_FEI.a: ${OBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ @@ -30,7 +29,7 @@ +libHYPRE_FEI.so: ${OBJS} ${HYPRE_LIBS} @echo "Building $@ ... " - ${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME} -+ ${BUILD_CC_SHARED} -o ${SONAME} $^ ${SYSTEM_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS} ++ ${BUILD_CC_SHARED} -o ${SONAME} $^ ${SYSTEM_LIBS} ${SUPERLU_LIBS} ${DSUPERLU_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS} ln -s ${SONAME} $@ ${OBJS}: ${HEADERS} @@ -38,12 +37,11 @@ =================================================================== --- hypre.orig/src/FEI_mv/femli/Makefile +++ hypre/src/FEI_mv/femli/Makefile -@@ -10,6 +10,17 @@ +@@ -10,6 +10,16 @@ # $Revision$ #EHEADER********************************************************************** +HYPRE_LIBS = \ -+ ../SuperLU/SRC/libHYPRE_superlu.so \ + ../../utilities/libHYPRE_utilities.so \ + ../../krylov/libHYPRE_krylov.so \ + ../../seq_mv/libHYPRE_seq_mv.so \ @@ -56,7 +54,7 @@ include ../../config/Makefile.config -@@ -156,9 +167,10 @@ libHYPRE_mli.a: ${OBJS} +@@ -157,9 +167,10 @@ libHYPRE_mli.a: ${OBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ @@ -130,7 +128,7 @@ =================================================================== --- hypre.orig/src/distributed_ls/ParaSails/Makefile +++ hypre/src/distributed_ls/ParaSails/Makefile -@@ -23,6 +23,13 @@ C_COMPILE_FLAGS = \ +@@ -25,6 +25,13 @@ C_COMPILE_FLAGS = \ -I$(srcdir)/../../distributed_matrix\ ${CINCLUDES} @@ -144,7 +142,7 @@ HEADERS =\ Common.h\ ConjGrad.h\ -@@ -84,9 +91,9 @@ libHYPRE_ParaSails.a: ${OBJS} +@@ -86,9 +93,9 @@ libHYPRE_ParaSails.a: ${OBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ @@ -214,7 +212,7 @@ =================================================================== --- hypre.orig/src/krylov/Makefile +++ hypre/src/krylov/Makefile -@@ -22,6 +22,10 @@ C_COMPILE_FLAGS = \ +@@ -24,6 +24,10 @@ C_COMPILE_FLAGS = \ -I$(srcdir)/../utilities\ ${CINCLUDES} @@ -225,7 +223,7 @@ HEADERS =\ HYPRE_krylov.h\ krylov.h\ -@@ -86,9 +90,9 @@ libHYPRE_krylov.a: ${OBJS} +@@ -88,9 +92,9 @@ libHYPRE_krylov.a: ${OBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ @@ -319,7 +317,7 @@ =================================================================== --- hypre.orig/src/parcsr_ls/Makefile +++ hypre/src/parcsr_ls/Makefile -@@ -33,6 +33,22 @@ C_COMPILE_FLAGS =\ +@@ -34,6 +34,22 @@ C_COMPILE_FLAGS =\ -I$(srcdir)/../parcsr_block_mv\ ${CINCLUDES} @@ -342,7 +340,7 @@ HEADERS =\ HYPRE_parcsr_ls.h\ _hypre_parcsr_ls.h\ -@@ -158,9 +174,9 @@ libHYPRE_parcsr_ls.a: ${OBJS} +@@ -165,9 +181,9 @@ libHYPRE_parcsr_ls.a: ${OBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ @@ -386,12 +384,12 @@ =================================================================== --- hypre.orig/src/seq_mv/Makefile +++ hypre/src/seq_mv/Makefile -@@ -76,9 +76,9 @@ libHYPRE_seq_mv.a: ${OBJS} +@@ -80,9 +80,9 @@ libHYPRE_seq_mv.a: ${OBJS} ${CUOBJS} ${AR} $@ ${OBJS} ${RANLIB} $@ --libHYPRE_seq_mv.so: ${OBJS} -+libHYPRE_seq_mv.so: ${OBJS} ../utilities/libHYPRE_utilities.so +-libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS} ++libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS} ../utilities/libHYPRE_utilities.so @echo "Building $@ ... " - ${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME} + ${BUILD_CC_SHARED} -o ${SONAME} $^ ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS} @@ -518,7 +516,7 @@ =================================================================== --- hypre.orig/src/utilities/Makefile +++ hypre/src/utilities/Makefile -@@ -100,7 +100,7 @@ libHYPRE_utilities.a: ${OBJS} +@@ -102,7 +102,7 @@ libHYPRE_utilities.a: ${OBJS} libHYPRE_utilities.so: ${OBJS} @echo "Building $@ ... " diff -Nru hypre-2.11.2/debian/patches/superlu_internal.patch hypre-2.13.0/debian/patches/superlu_internal.patch --- hypre-2.11.2/debian/patches/superlu_internal.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/superlu_internal.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,106 +0,0 @@ -Index: hypre/src/FEI_mv/SuperLU/SRC/Makefile -=================================================================== ---- hypre.orig/src/FEI_mv/SuperLU/SRC/Makefile -+++ hypre/src/FEI_mv/SuperLU/SRC/Makefile -@@ -55,6 +55,9 @@ HEADERS =\ - slu_ddefs.h\ - slu_util.h\ - supermatrix.h -+ -+SYSTEM_LIBS = ${BLASLIBDIRS} ${BLASLIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS} -lm -+ - ####################################### - - ### LAPACK -@@ -153,13 +156,15 @@ FILES =\ - - OBJS = ${FILES:.c=.o} - -+SONAME = libHYPRE_superlu-${HYPRE_RELEASE_VERSION}.so -+ - all: libHYPRE_superlu${HYPRE_LIB_SUFFIX} - cp -fp *.h $(HYPRE_BUILD_DIR)/include - # cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib - - install: libHYPRE_superlu${HYPRE_LIB_SUFFIX} -- cp -f *.h $(HYPRE_INC_INSTALL) --# cp -f libHYPRE* $(HYPRE_LIB_INSTALL) -+# cp -fR *.h $(HYPRE_INC_INSTALL) -+ cp -fR libHYPRE* $(HYPRE_LIB_INSTALL) - @echo " " - - clean: -@@ -174,7 +179,8 @@ libHYPRE_superlu.a: ${OBJS} - - libHYPRE_superlu.so: ${OBJS} - @echo "Building $@ ... " -- ${BUILD_CC_SHARED} -o $@ ${OBJS} -+ ${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SYSTEM_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS} -+ ln -s ${SONAME} $@ - - ${OBJS}: ${HEADERS} - -Index: hypre/src/config/configure.in -=================================================================== ---- hypre.orig/src/config/configure.in -+++ hypre/src/config/configure.in -@@ -1045,13 +1045,6 @@ then - HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv" - HYPRE_FEI_SUBDIRS="fei-hypre" - HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o" -- if test "$hypre_using_superlu" = "yes" -- then -- HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" -- HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" -- else -- HYPRE_FEI_SUPERLU_FILES= -- fi - if test "$hypre_using_mli" = "yes" - then - HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" -@@ -1059,6 +1052,13 @@ then - else - HYPRE_FEI_FEMLI_FILES= - fi -+ if test "$hypre_using_superlu" = "yes" -+ then -+ HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" -+ HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" -+ else -+ HYPRE_FEI_SUPERLU_FILES= -+ fi - if test "$hypre_user_chose_fei" = "no" - then - HYPRE_FEI_BASE_DIR="$HYPRE_SRCDIR/FEI_mv/fei-base" -Index: hypre/src/configure -=================================================================== ---- hypre.orig/src/configure -+++ hypre/src/configure -@@ -6731,13 +6731,6 @@ then - HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv" - HYPRE_FEI_SUBDIRS="fei-hypre" - HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o" -- if test "$hypre_using_superlu" = "yes" -- then -- HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" -- HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" -- else -- HYPRE_FEI_SUPERLU_FILES= -- fi - if test "$hypre_using_mli" = "yes" - then - HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" -@@ -6745,6 +6738,13 @@ then - else - HYPRE_FEI_FEMLI_FILES= - fi -+ if test "$hypre_using_superlu" = "yes" -+ then -+ HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" -+ HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" -+ else -+ HYPRE_FEI_SUPERLU_FILES= -+ fi - if test "$hypre_user_chose_fei" = "no" - then - HYPRE_FEI_BASE_DIR="$HYPRE_SRCDIR/FEI_mv/fei-base" diff -Nru hypre-2.11.2/debian/patches/test.patch hypre-2.13.0/debian/patches/test.patch --- hypre-2.11.2/debian/patches/test.patch 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/patches/test.patch 2017-12-11 19:47:42.000000000 +0000 @@ -224,7 +224,7 @@ =================================================================== --- hypre.orig/src/test/Makefile +++ hypre/src/test/Makefile -@@ -44,7 +44,27 @@ BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS +@@ -46,7 +46,27 @@ BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS LIBFLAGS = ${LDFLAGS} ${LIBS} LFLAGS =\ diff -Nru hypre-2.11.2/debian/rules hypre-2.13.0/debian/rules --- hypre-2.11.2/debian/rules 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/rules 2017-12-11 19:47:42.000000000 +0000 @@ -2,13 +2,13 @@ # Made with the aid of debmake, by Christoph Lameter, # based on the sample debian/rules file for GNU hello by Ian Jackson. -export HYPRE_SOVERSION=2.11.2 +export HYPRE_SOVERSION=2.13.0 export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic -O3 export DEB_CXXFLAGS_MAINT_APPEND = -Wall -pedantic -O3 export DEB_LDFLAGS_MAINT_APPEND = -Wl,--no-as-needed -export DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) -export DEB_HOST_ARCH := $(shell dpkg-architecture -qDEB_HOST_ARCH) +export DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) +export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) %: dh $@ --sourcedirectory=src --with autoreconf @@ -22,8 +22,9 @@ # extra flags set to be similar to what petsc requires extra_flags += \ --with-fei --with-mli \ - --with-superlu \ - --with-blas \ + --with-superlu --with-superlu-include=/usr/include/superlu --with-superlu-lib="-lsuperlu" \ + --with-dsuperlu --with-dsuperlu-include=/usr/include/superlu-dist --with-dsuperlu-lib="-lsuperlu_dist" \ + --with-blas --with-blas-lib="-lblas" \ --with-lapack-libs="lapack" --with-lapack-lib-dirs="/usr/lib/lapack" --with-fmangle-lapack="one-underscore" \ --prefix=$(CURDIR)/debian/tmp/usr diff -Nru hypre-2.11.2/debian/source/lintian-overrides hypre-2.13.0/debian/source/lintian-overrides --- hypre-2.11.2/debian/source/lintian-overrides 2017-10-18 09:57:39.000000000 +0000 +++ hypre-2.13.0/debian/source/lintian-overrides 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -# Lintian thinks uploader Adam Powell's name violates policy -hypre source: uploader-address-missing "Adam C. Powell -hypre source: uploader-not-full-name IV" - -# source for these files is provided in the same directory -hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testdlamch -hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testslamch -hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testtimer Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/docs/HYPRE_ref_manual.pdf and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/docs/HYPRE_ref_manual.pdf differ Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/docs/HYPRE_usr_manual.pdf and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/docs/HYPRE_usr_manual.pdf differ diff -Nru hypre-2.11.2/src/blas/blas_utils.c hypre-2.13.0/src/blas/blas_utils.c --- hypre-2.11.2/src/blas/blas_utils.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/blas_utils.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ - -#include "hypre_blas.h" -#include "f2c.h" - -logical hypre_lsame_(const char *ca,const char *cb) -{ -/* -- LAPACK auxiliary routine (version 3.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - LSAME returns .TRUE. if CA is the same letter as CB regardless of - case. - - Arguments - ========= - - CA (input) CHARACTER*1 - CB (input) CHARACTER*1 - CA and CB specify the single characters to be compared. - - ===================================================================== - - - - Test if the characters are equal */ - /* System generated locals */ - logical ret_val; - /* Local variables */ - static integer inta, intb, zcode; - - - ret_val = *(unsigned char *)ca == *(unsigned char *)cb; - if (ret_val) { - return ret_val; - } - -/* Now test for equivalence if both characters are alphabetic. */ - - zcode = 'Z'; - -/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime - machines, on which ICHAR returns a value with bit 8 set. - ICHAR('A') on Prime machines returns 193 which is the same as - ICHAR('A') on an EBCDIC machine. */ - - inta = *(unsigned char *)ca; - intb = *(unsigned char *)cb; - - if (zcode == 90 || zcode == 122) { - -/* ASCII is assumed - ZCODE is the ASCII code of either lower o -r - upper case 'Z'. */ - - if (inta >= 97 && inta <= 122) { - inta += -32; - } - if (intb >= 97 && intb <= 122) { - intb += -32; - } - - } else if (zcode == 233 || zcode == 169) { - -/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower - or - upper case 'Z'. */ - - if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || - (inta >= 162 && inta <= 169)) { - inta += 64; - } - if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || - (intb >= 162 && intb <= 169)) { - intb += 64; - } - - } else if (zcode == 218 || zcode == 250) { - -/* ASCII is assumed, on Prime machines - ZCODE is the ASCII cod -e - plus 128 of either lower or upper case 'Z'. */ - - if (inta >= 225 && inta <= 250) { - inta += -32; - } - if (intb >= 225 && intb <= 250) { - intb += -32; - } - } - ret_val = inta == intb; - -/* RETURN - - End of LSAME */ - - return ret_val; -} /* hypre_lsame_ */ - -#include -#include "f2c.h" - -/* Subroutine */ HYPRE_Int hypre_xerbla_(const char *srname, integer *info) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - XERBLA is an error handler for the LAPACK routines. - It is called by an LAPACK routine if an input parameter has an - invalid value. A message is printed and execution stops. - - Installers may consider modifying the STOP statement in order to - call system-specific exception-handling facilities. - - Arguments - ========= - - SRNAME (input) CHARACTER*6 - The name of the routine which called XERBLA. - - INFO (input) INTEGER - The position of the invalid parameter in the parameter list - - of the calling routine. - - ===================================================================== -*/ - - hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n", - srname, *info); - -/* End of XERBLA */ - - return 0; -} /* hypre_xerbla_ */ - -#include "f2c.h" -#include "hypre_blas.h" - -/* compare two strings */ - -#ifdef KR_headers -integer s_cmp(a0, b0, la, lb) char *a0;const char *b0; ftnlen la, lb; -#else -integer s_cmp(char *a0,const char *b0, ftnlen la, ftnlen lb) -#endif -{ -register unsigned char *a, *aend, *b, *bend; -a = (unsigned char *)a0; -b = (unsigned char *)b0; -aend = a + la; -bend = b + lb; - -if(la <= lb) - { - while(a < aend) - if(*a != *b) - return( *a - *b ); - else - { ++a; ++b; } - - while(b < bend) - if(*b != ' ') - return( ' ' - *b ); - else ++b; - } - -else - { - while(b < bend) - if(*a == *b) - { ++a; ++b; } - else - return( *a - *b ); - while(a < aend) - if(*a != ' ') - return(*a - ' '); - else ++a; - } -return(0); -} -//#include "f2c.h" -//#include "hypre_blas.h" - -/* assign strings: a = b */ - -#ifdef KR_headers -VOID hypre_s_copy(a, b, la, lb) char *a,const char *b; ftnlen la, lb; -#else -void hypre_s_copy(char *a,const char *b, ftnlen la, ftnlen lb) -#endif -{ -register char *aend, *bend; - -aend = a + la; - -if(la <= lb) - while(a < aend) - *a++ = *b++; - -else - { - bend = (char*)b + lb; - while(b < bend) - *a++ = *b++; - while(a < aend) - *a++ = ' '; - } -} diff -Nru hypre-2.11.2/src/blas/dasum.c hypre-2.13.0/src/blas/dasum.c --- hypre-2.11.2/src/blas/dasum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dasum.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -83,3 +85,6 @@ return ret_val; } /* dasum_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/daxpy.c hypre-2.13.0/src/blas/daxpy.c --- hypre-2.11.2/src/blas/daxpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/daxpy.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int daxpy_(integer *n, doublereal *da, doublereal *dx, +/* Subroutine */ integer daxpy_(integer *n, doublereal *da, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { @@ -90,3 +92,6 @@ return 0; } /* daxpy_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dcopy.c hypre-2.13.0/src/blas/dcopy.c --- hypre-2.11.2/src/blas/dcopy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dcopy.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,3 +1,6 @@ +#ifdef __cplusplus +extern "C" { +#endif /* dcopy.f -- translated by f2c (version 19960315). You must link the resulting object file with the libraries: @@ -7,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dcopy_(integer* n, doublereal* dx,integer* incx,doublereal* dy,integer* incy) +/* Subroutine */ integer dcopy_(integer* n, doublereal* dx,integer* incx,doublereal* dy,integer* incy) { /* System generated locals */ integer i__1; @@ -87,3 +90,6 @@ return 0; } /* dcopy_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/ddot.c hypre-2.13.0/src/blas/ddot.c --- hypre-2.11.2/src/blas/ddot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/ddot.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,3 +1,6 @@ +#ifdef __cplusplus +extern "C" { +#endif /* ddot.f -- translated by f2c (version 19960315). You must link the resulting object file with the libraries: @@ -91,3 +94,6 @@ return ret_val; } /* ddot_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dgemm.c hypre-2.13.0/src/blas/dgemm.c --- hypre-2.11.2/src/blas/dgemm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dgemm.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,3 +1,6 @@ +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -7,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dgemm_(const char *transa,const char *transb, integer *m, integer * +/* Subroutine */ integer dgemm_(const char *transa,const char *transb, integer *m, integer * n, integer *k, doublereal *alpha, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *beta, doublereal *c, integer *ldc) @@ -21,9 +24,9 @@ static logical nota, notb; static doublereal temp; static integer i, j, l; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa, nrowb; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); /* Purpose @@ -192,8 +195,8 @@ #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] #define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)] - nota = hypre_lsame_(transa, "N"); - notb = hypre_lsame_(transb, "N"); + nota = lsame_(transa, "N"); + notb = lsame_(transb, "N"); if (nota) { nrowa = *m; } else { @@ -208,9 +211,9 @@ /* Test the input parameters. */ info = 0; - if (! nota && ! hypre_lsame_(transa, "C") && ! hypre_lsame_(transa, "T")) { + if (! nota && ! lsame_(transa, "C") && ! lsame_(transa, "T")) { info = 1; - } else if (! notb && ! hypre_lsame_(transb, "C") && ! hypre_lsame_(transb, + } else if (! notb && ! lsame_(transb, "C") && ! lsame_(transb, "T")) { info = 2; } else if (*m < 0) { @@ -227,7 +230,7 @@ info = 13; } if (info != 0) { - hypre_xerbla_("DGEMM ", &info); + xerbla_("DGEMM ", &info); return 0; } @@ -369,3 +372,7 @@ /* End of DGEMM . */ } /* dgemm_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dgemv.c hypre-2.13.0/src/blas/dgemv.c --- hypre-2.11.2/src/blas/dgemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dgemv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dgemv_(const char *trans, integer *m, integer *n, doublereal * +/* Subroutine */ integer dgemv_(const char *trans, integer *m, integer *n, doublereal * alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -20,9 +22,9 @@ static integer info; static doublereal temp; static integer lenx, leny, i, j; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); /* Purpose @@ -131,8 +133,8 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; - if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") && ! - hypre_lsame_(trans, "C")) { + if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { info = 1; } else if (*m < 0) { info = 2; @@ -146,7 +148,7 @@ info = 11; } if (info != 0) { - hypre_xerbla_("DGEMV ", &info); + xerbla_("DGEMV ", &info); return 0; } @@ -160,7 +162,7 @@ up the start points in X and Y. */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { lenx = *n; leny = *m; } else { @@ -216,7 +218,7 @@ if (*alpha == 0.) { return 0; } - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { /* Form y := alpha*A*x + y. */ @@ -285,3 +287,7 @@ /* End of DGEMV . */ } /* dgemv_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dger.c hypre-2.13.0/src/blas/dger.c --- hypre-2.11.2/src/blas/dger.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dger.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,11 @@ -#include "hypre_blas.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dger_(integer *m, integer *n, doublereal *alpha, +/* Subroutine */ integer dger_(integer *m, integer *n, doublereal *alpha, doublereal *x, integer *incx, doublereal *y, integer *incy, doublereal *a, integer *lda) { @@ -11,7 +15,7 @@ static integer info; static doublereal temp; static integer i__, j, ix, jy, kx; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] /* Purpose ======= @@ -86,7 +90,7 @@ info = 9; } if (info != 0) { - hypre_xerbla_("DGER ", &info); + xerbla_("DGER ", &info); return 0; } /* Quick return if possible. */ @@ -141,3 +145,6 @@ } /* dger_ */ #undef a_ref +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dnrm2.c hypre-2.13.0/src/blas/dnrm2.c --- hypre-2.11.2/src/blas/dnrm2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dnrm2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,3 +1,6 @@ +#ifdef __cplusplus +extern "C" { +#endif /* dnrm2.f -- translated by f2c (version 19960315). You must link the resulting object file with the libraries: @@ -6,7 +9,6 @@ #include "f2c.h" #include "hypre_blas.h" -#include "math.h" doublereal dnrm2_(integer*n,doublereal* dx,integer* incx) { @@ -24,7 +26,7 @@ doublereal ret_val, d__1; /* Builtin functions */ - /*HYPRE_Real sqrt(doublereal);*/ + /*doublereal sqrt(doublereal);*/ /* Local variables */ static doublereal xmax; @@ -90,7 +92,7 @@ ix = 1; /* begin main loop */ L20: - switch ((HYPRE_Int)next) { + switch ((integer)next) { case 0: goto L30; case 1: goto L50; case 2: goto L70; @@ -200,3 +202,6 @@ return ret_val; } /* dnrm2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/drot.c hypre-2.13.0/src/blas/drot.c --- hypre-2.11.2/src/blas/drot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/drot.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int drot_(integer *n, doublereal *dx, integer *incx, +/* Subroutine */ integer drot_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy, doublereal *c, doublereal *s) { @@ -73,3 +75,6 @@ return 0; } /* drot_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dscal.c hypre-2.13.0/src/blas/dscal.c --- hypre-2.11.2/src/blas/dscal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dscal.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dscal_(integer *n, doublereal *da, doublereal *dx, +/* Subroutine */ integer dscal_(integer *n, doublereal *da, doublereal *dx, integer *incx) { @@ -78,3 +80,6 @@ return 0; } /* dscal_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dswap.c hypre-2.13.0/src/blas/dswap.c --- hypre-2.11.2/src/blas/dswap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dswap.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_blas.h" #include "f2c.h" +#include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dswap_(integer *n, doublereal *dx, integer *incx, +/* Subroutine */ integer dswap_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ @@ -80,3 +83,6 @@ return 0; } /* dswap_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dsymm.c hypre-2.13.0/src/blas/dsymm.c --- hypre-2.11.2/src/blas/dsymm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dsymm.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_blas.h" #include "f2c.h" +#include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dsymm_(const char *side,const char *uplo, integer *m, integer *n, +/* Subroutine */ integer dsymm_(const char *side,const char *uplo, integer *m, integer *n, doublereal *alpha, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, integer *ldc) { @@ -13,10 +16,10 @@ static integer info; static doublereal temp1, temp2; static integer i__, j, k; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa; static logical upper; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] #define c___ref(a_1,a_2) c__[(a_2)*c_dim1 + a_1] @@ -127,17 +130,17 @@ c_offset = 1 + c_dim1 * 1; c__ -= c_offset; /* Function Body */ - if (hypre_lsame_(side, "L")) { + if (lsame_(side, "L")) { nrowa = *m; } else { nrowa = *n; } - upper = hypre_lsame_(uplo, "U"); + upper = lsame_(uplo, "U"); /* Test the input parameters. */ info = 0; - if (! hypre_lsame_(side, "L") && ! hypre_lsame_(side, "R")) { + if (! lsame_(side, "L") && ! lsame_(side, "R")) { info = 1; - } else if (! upper && ! hypre_lsame_(uplo, "L")) { + } else if (! upper && ! lsame_(uplo, "L")) { info = 2; } else if (*m < 0) { info = 3; @@ -151,7 +154,7 @@ info = 12; } if (info != 0) { - hypre_xerbla_("DSYMM ", &info); + xerbla_("DSYMM ", &info); return 0; } /* Quick return if possible. */ @@ -184,7 +187,7 @@ return 0; } /* Start the operations. */ - if (hypre_lsame_(side, "L")) { + if (lsame_(side, "L")) { /* Form C := alpha*A*B + beta*C. */ if (upper) { i__1 = *n; @@ -291,3 +294,6 @@ #undef b_ref #undef a_ref +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dsymv.c hypre-2.13.0/src/blas/dsymv.c --- hypre-2.11.2/src/blas/dsymv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dsymv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dsymv_(const char *uplo, integer *n, doublereal *alpha, +/* Subroutine */ integer dsymv_(const char *uplo, integer *n, doublereal *alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -20,9 +22,9 @@ static integer info; static doublereal temp1, temp2; static integer i, j; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); /* Purpose @@ -129,7 +131,7 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; - if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) { + if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { info = 1; } else if (*n < 0) { info = 2; @@ -141,7 +143,7 @@ info = 10; } if (info != 0) { - hypre_xerbla_("DSYMV ", &info); + xerbla_("DSYMV ", &info); return 0; } @@ -203,7 +205,7 @@ if (*alpha == 0.) { return 0; } - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { /* Form y when A is stored in upper triangle. */ @@ -287,3 +289,6 @@ } /* dsymv_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dsyr2.c hypre-2.13.0/src/blas/dsyr2.c --- hypre-2.11.2/src/blas/dsyr2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dsyr2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dsyr2_(const char *uplo, integer *n, doublereal *alpha, +/* Subroutine */ integer dsyr2_(const char *uplo, integer *n, doublereal *alpha, doublereal *x, integer *incx, doublereal *y, integer *incy, doublereal *a, integer *lda) { @@ -20,9 +22,9 @@ static integer info; static doublereal temp1, temp2; static integer i, j; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); /* Purpose @@ -128,7 +130,7 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; - if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) { + if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { info = 1; } else if (*n < 0) { info = 2; @@ -140,7 +142,7 @@ info = 9; } if (info != 0) { - hypre_xerbla_("DSYR2 ", &info); + xerbla_("DSYR2 ", &info); return 0; } @@ -173,7 +175,7 @@ accessed sequentially with one pass through the triangular part of A. */ - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { /* Form A when A is stored in the upper triangle. */ @@ -255,3 +257,6 @@ } /* dsyr2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dsyr2k.c hypre-2.13.0/src/blas/dsyr2k.c --- hypre-2.11.2/src/blas/dsyr2k.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dsyr2k.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dsyr2k_(const char *uplo,const char *trans, integer *n, integer *k, +/* Subroutine */ integer dsyr2k_(const char *uplo,const char *trans, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, integer *ldc) { @@ -13,10 +16,10 @@ static integer info; static doublereal temp1, temp2; static integer i__, j, l; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa; static logical upper; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] #define c___ref(a_1,a_2) c__[(a_2)*c_dim1 + a_1] @@ -128,17 +131,17 @@ c_offset = 1 + c_dim1 * 1; c__ -= c_offset; /* Function Body */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { nrowa = *n; } else { nrowa = *k; } - upper = hypre_lsame_(uplo, "U"); + upper = lsame_(uplo, "U"); info = 0; - if (! upper && ! hypre_lsame_(uplo, "L")) { + if (! upper && ! lsame_(uplo, "L")) { info = 1; - } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, - "T") && ! hypre_lsame_(trans, "C")) { + } else if (! lsame_(trans, "N") && ! lsame_(trans, + "T") && ! lsame_(trans, "C")) { info = 2; } else if (*n < 0) { info = 3; @@ -152,7 +155,7 @@ info = 12; } if (info != 0) { - hypre_xerbla_("DSYR2K", &info); + xerbla_("DSYR2K", &info); return 0; } /* Quick return if possible. */ @@ -209,7 +212,7 @@ return 0; } /* Start the operations. */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { /* Form C := alpha*A*B' + alpha*B*A' + C. */ if (upper) { i__1 = *n; @@ -333,3 +336,6 @@ #undef b_ref #undef a_ref +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dsyrk.c hypre-2.13.0/src/blas/dsyrk.c --- hypre-2.11.2/src/blas/dsyrk.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dsyrk.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dsyrk_(const char *uplo,const char *trans, integer *n, integer *k, +/* Subroutine */ integer dsyrk_(const char *uplo,const char *trans, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda, doublereal *beta, doublereal *c, integer *ldc) { @@ -20,10 +22,10 @@ static integer info; static doublereal temp; static integer i, j, l; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa; static logical upper; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); /* Purpose @@ -174,18 +176,18 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] #define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)] - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { nrowa = *n; } else { nrowa = *k; } - upper = hypre_lsame_(uplo, "U"); + upper = lsame_(uplo, "U"); info = 0; - if (! upper && ! hypre_lsame_(uplo, "L")) { + if (! upper && ! lsame_(uplo, "L")) { info = 1; - } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") && - ! hypre_lsame_(trans, "C")) { + } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && + ! lsame_(trans, "C")) { info = 2; } else if (*n < 0) { info = 3; @@ -197,7 +199,7 @@ info = 10; } if (info != 0) { - hypre_xerbla_("DSYRK ", &info); + xerbla_("DSYRK ", &info); return 0; } @@ -252,7 +254,7 @@ /* Start the operations. */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { /* Form C := alpha*A*A' + beta*C. */ @@ -353,3 +355,7 @@ /* End of DSYRK . */ } /* dsyrk_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dtrmm.c hypre-2.13.0/src/blas/dtrmm.c --- hypre-2.11.2/src/blas/dtrmm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dtrmm.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_blas.h" #include "f2c.h" +#include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dtrmm_(const char *side,const char *uplo,const char *transa,const char *diag, +/* Subroutine */ integer dtrmm_(const char *side,const char *uplo,const char *transa,const char *diag, integer *m, integer *n, doublereal *alpha, doublereal *a, integer * lda, doublereal *b, integer *ldb) { @@ -13,10 +16,10 @@ static doublereal temp; static integer i__, j, k; static logical lside; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa; static logical upper; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical nounit; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] @@ -111,23 +114,23 @@ b_offset = 1 + b_dim1 * 1; b -= b_offset; /* Function Body */ - lside = hypre_lsame_(side, "L"); + lside = lsame_(side, "L"); if (lside) { nrowa = *m; } else { nrowa = *n; } - nounit = hypre_lsame_(diag, "N"); - upper = hypre_lsame_(uplo, "U"); + nounit = lsame_(diag, "N"); + upper = lsame_(uplo, "U"); info = 0; - if (! lside && ! hypre_lsame_(side, "R")) { + if (! lside && ! lsame_(side, "R")) { info = 1; - } else if (! upper && ! hypre_lsame_(uplo, "L")) { + } else if (! upper && ! lsame_(uplo, "L")) { info = 2; - } else if (! hypre_lsame_(transa, "N") && ! hypre_lsame_(transa, - "T") && ! hypre_lsame_(transa, "C")) { + } else if (! lsame_(transa, "N") && ! lsame_(transa, + "T") && ! lsame_(transa, "C")) { info = 3; - } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, + } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { info = 4; } else if (*m < 0) { @@ -140,7 +143,7 @@ info = 11; } if (info != 0) { - hypre_xerbla_("DTRMM ", &info); + xerbla_("DTRMM ", &info); return 0; } /* Quick return if possible. */ @@ -162,7 +165,7 @@ } /* Start the operations. */ if (lside) { - if (hypre_lsame_(transa, "N")) { + if (lsame_(transa, "N")) { /* Form B := alpha*A*B. */ if (upper) { i__1 = *n; @@ -250,7 +253,7 @@ } } } else { - if (hypre_lsame_(transa, "N")) { + if (lsame_(transa, "N")) { /* Form B := alpha*B*A. */ if (upper) { for (j = *n; j >= 1; --j) { @@ -374,3 +377,6 @@ #undef b_ref #undef a_ref +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dtrmv.c hypre-2.13.0/src/blas/dtrmv.c --- hypre-2.11.2/src/blas/dtrmv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dtrmv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_blas.h" #include "f2c.h" +#include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dtrmv_(const char *uplo,const char *trans,const char *diag, integer *n, +/* Subroutine */ integer dtrmv_(const char *uplo,const char *trans,const char *diag, integer *n, doublereal *a, integer *lda, doublereal *x, integer *incx) { /* System generated locals */ @@ -11,9 +14,9 @@ static integer info; static doublereal temp; static integer i__, j; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer ix, jx, kx; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical nounit; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] /* Purpose @@ -88,12 +91,12 @@ --x; /* Function Body */ info = 0; - if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) { + if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { info = 1; - } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, - "T") && ! hypre_lsame_(trans, "C")) { + } else if (! lsame_(trans, "N") && ! lsame_(trans, + "T") && ! lsame_(trans, "C")) { info = 2; - } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, + } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { info = 3; } else if (*n < 0) { @@ -104,14 +107,14 @@ info = 8; } if (info != 0) { - hypre_xerbla_("DTRMV ", &info); + xerbla_("DTRMV ", &info); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } - nounit = hypre_lsame_(diag, "N"); + nounit = lsame_(diag, "N"); /* Set up the start point in X if the increment is not unity. This will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { @@ -121,9 +124,9 @@ } /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { /* Form x := A*x. */ - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { @@ -201,7 +204,7 @@ } } else { /* Form x := A'*x. */ - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; @@ -276,3 +279,6 @@ } /* dtrmv_ */ #undef a_ref +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dtrsm.c hypre-2.13.0/src/blas/dtrsm.c --- hypre-2.11.2/src/blas/dtrsm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dtrsm.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dtrsm_(const char *side,const char *uplo,const char *transa,const char *diag, +/* Subroutine */ integer dtrsm_(const char *side,const char *uplo,const char *transa,const char *diag, integer *m, integer *n, doublereal *alpha, doublereal *a, integer * lda, doublereal *b, integer *ldb) { @@ -21,10 +23,10 @@ static doublereal temp; static integer i, j, k; static logical lside; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer nrowa; static logical upper; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical nounit; @@ -184,24 +186,24 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] - lside = hypre_lsame_(side, "L"); + lside = lsame_(side, "L"); if (lside) { nrowa = *m; } else { nrowa = *n; } - nounit = hypre_lsame_(diag, "N"); - upper = hypre_lsame_(uplo, "U"); + nounit = lsame_(diag, "N"); + upper = lsame_(uplo, "U"); info = 0; - if (! lside && ! hypre_lsame_(side, "R")) { + if (! lside && ! lsame_(side, "R")) { info = 1; - } else if (! upper && ! hypre_lsame_(uplo, "L")) { + } else if (! upper && ! lsame_(uplo, "L")) { info = 2; - } else if (! hypre_lsame_(transa, "N") && ! hypre_lsame_(transa, "T") - && ! hypre_lsame_(transa, "C")) { + } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") + && ! lsame_(transa, "C")) { info = 3; - } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, "N")) { + } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { info = 4; } else if (*m < 0) { info = 5; @@ -213,7 +215,7 @@ info = 11; } if (info != 0) { - hypre_xerbla_("DTRSM ", &info); + xerbla_("DTRSM ", &info); return 0; } @@ -239,7 +241,7 @@ /* Start the operations. */ if (lside) { - if (hypre_lsame_(transa, "N")) { + if (lsame_(transa, "N")) { /* Form B := alpha*inv( A )*B. */ @@ -327,7 +329,7 @@ } } } else { - if (hypre_lsame_(transa, "N")) { + if (lsame_(transa, "N")) { /* Form B := alpha*B*inv( A ). */ @@ -451,3 +453,7 @@ /* End of DTRSM . */ } /* dtrsm_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/dtrsv.c hypre-2.13.0/src/blas/dtrsv.c --- hypre-2.11.2/src/blas/dtrsv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/dtrsv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -8,7 +10,7 @@ #include "f2c.h" #include "hypre_blas.h" -/* Subroutine */ HYPRE_Int dtrsv_(const char *uplo,const char *trans,const char *diag, integer *n, +/* Subroutine */ integer dtrsv_(const char *uplo,const char *trans,const char *diag, integer *n, doublereal *a, integer *lda, doublereal *x, integer *incx) { @@ -19,9 +21,9 @@ static integer info; static doublereal temp; static integer i, j; - extern logical hypre_lsame_(const char *,const char *); + extern logical lsame_(const char *,const char *); static integer ix, jx, kx; - extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical nounit; @@ -137,12 +139,12 @@ #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; - if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) { + if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { info = 1; - } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") && - ! hypre_lsame_(trans, "C")) { + } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && + ! lsame_(trans, "C")) { info = 2; - } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, "N")) { + } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { info = 3; } else if (*n < 0) { info = 4; @@ -152,7 +154,7 @@ info = 8; } if (info != 0) { - hypre_xerbla_("DTRSV ", &info); + xerbla_("DTRSV ", &info); return 0; } @@ -162,7 +164,7 @@ return 0; } - nounit = hypre_lsame_(diag, "N"); + nounit = lsame_(diag, "N"); /* Set up the start point in X if the increment is not unity. This will be ( N - 1 )*INCX too small for descending loops. */ @@ -176,11 +178,11 @@ /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ - if (hypre_lsame_(trans, "N")) { + if (lsame_(trans, "N")) { /* Form x := inv( A )*x. */ - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { if (*incx == 1) { for (j = *n; j >= 1; --j) { if (X(j) != 0.) { @@ -253,7 +255,7 @@ /* Form x := inv( A' )*x. */ - if (hypre_lsame_(uplo, "U")) { + if (lsame_(uplo, "U")) { if (*incx == 1) { for (j = 1; j <= *n; ++j) { temp = X(j); @@ -327,3 +329,6 @@ } /* dtrsv_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/f2c.c hypre-2.13.0/src/blas/f2c.c --- hypre-2.11.2/src/blas/f2c.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/blas/f2c.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,149 @@ +#ifdef __cplusplus +extern "C" { +#endif + +/*----------------------------------------------------------------------------- + * Contains functions found in the f2c library to avoid needing -lf2c + *-----------------------------------------------------------------------------*/ + +#include "f2c.h" +#include "hypre_blas.h" + +/* compare two strings */ + +integer s_cmp(char *a0,const char *b0, ftnlen la, ftnlen lb) +{ +register unsigned char *a, *aend, *b, *bend; +a = (unsigned char *)a0; +b = (unsigned char *)b0; +aend = a + la; +bend = b + lb; + +if(la <= lb) + { + while(a < aend) + if(*a != *b) + return( *a - *b ); + else + { ++a; ++b; } + + while(b < bend) + if(*b != ' ') + return( ' ' - *b ); + else ++b; + } + +else + { + while(b < bend) + if(*a == *b) + { ++a; ++b; } + else + return( *a - *b ); + while(a < aend) + if(*a != ' ') + return(*a - ' '); + else ++a; + } +return(0); +} + +/* assign strings: a = b */ + +integer s_copy(char *a,const char *b, ftnlen la, ftnlen lb) +{ +register char *aend, *bend; + +aend = a + la; + +if(la <= lb) + while(a < aend) + *a++ = *b++; + +else + { + bend = (char*)b + lb; + while(b < bend) + *a++ = *b++; + while(a < aend) + *a++ = ' '; + } +return(0); +} + +integer s_cat(char *lp, char *rpp[], ftnlen rnp[], ftnlen *np, ftnlen ll) +{ +ftnlen i, n, nc; +char *f__rp; + +n = (integer)*np; +for(i = 0 ; i < n ; ++i) + { + nc = ll; + if(rnp[i] < nc) + nc = rnp[i]; + ll -= nc; + f__rp = rpp[i]; + while(--nc >= 0) + *lp++ = *f__rp++; + } +while(--ll >= 0) + *lp++ = ' '; +return 0; +} + +#define log10e 0.43429448190325182765 + +#undef abs +#include "math.h" +doublereal d_lg10(doublereal *x) +{ +return( log10e * log(*x) ); +} + +doublereal d_sign(doublereal *a, doublereal *b) +{ +doublereal x; +x = (*a >= 0 ? *a : - *a); +return( *b >= 0 ? x : -x); +} + +doublereal pow_di(doublereal *ap, integer *bp) +{ +doublereal pow, x; +integer n; + +pow = 1; +x = *ap; +n = *bp; + +if(n != 0) + { + if(n < 0) + { + n = -n; + x = 1/x; + } + for( ; ; ) + { + if(n & 01) + pow *= x; + if(n >>= 1) + x *= x; + else + break; + } + } +return(pow); +} + +#undef abs +#include "math.h" +doublereal pow_dd(doublereal *ap, doublereal *bp) +{ +return(pow(*ap, *bp) ); +} + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/f2c.h hypre-2.13.0/src/blas/f2c.h --- hypre-2.11.2/src/blas/f2c.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/f2c.h 2017-10-20 17:42:22.000000000 +0000 @@ -25,6 +25,13 @@ #define HYPRE_SEQUENTIAL #endif #include "_hypre_utilities.h" +#include "math.h" + +#if defined(HYPRE_SINGLE) +#define sqrt sqrtf +#elif defined(HYPRE_LONG_DOUBLE) +#define sqrt sqrtl +#endif #ifdef HYPRE_BIGINT typedef long long int HYPRE_LongInt; diff -Nru hypre-2.11.2/src/blas/_hypre_blas.h hypre-2.13.0/src/blas/_hypre_blas.h --- hypre-2.11.2/src/blas/_hypre_blas.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/blas/_hypre_blas.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,130 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header file for HYPRE BLAS + * + *****************************************************************************/ + +#ifndef HYPRE_BLAS_H +#define HYPRE_BLAS_H + +#include "_hypre_utilities.h" +#include "fortran.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*-------------------------------------------------------------------------- + * Change all 'hypre_' names based on using HYPRE or external library + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_USING_HYPRE_BLAS + +#define hypre_dasum hypre_F90_NAME_BLAS(dasum ,DASUM ) +#define hypre_daxpy hypre_F90_NAME_BLAS(daxpy ,DAXPY ) +#define hypre_dcopy hypre_F90_NAME_BLAS(dcopy ,DCOPY ) +#define hypre_ddot hypre_F90_NAME_BLAS(ddot ,DDOT ) +#define hypre_dgemm hypre_F90_NAME_BLAS(dgemm ,DGEMM ) +#define hypre_dgemv hypre_F90_NAME_BLAS(dgemv ,DGEMV ) +#define hypre_dger hypre_F90_NAME_BLAS(dger ,DGER ) +#define hypre_dnrm2 hypre_F90_NAME_BLAS(dnrm2 ,DNRM2 ) +#define hypre_drot hypre_F90_NAME_BLAS(drot ,DROT ) +#define hypre_dscal hypre_F90_NAME_BLAS(dscal ,DSCAL ) +#define hypre_dswap hypre_F90_NAME_BLAS(dswap ,DSWAP ) +#define hypre_dsymm hypre_F90_NAME_BLAS(dsymm ,DSYMM ) +#define hypre_dsymv hypre_F90_NAME_BLAS(dsymv ,DSYMV ) +#define hypre_dsyr2 hypre_F90_NAME_BLAS(dsyr2 ,DSYR2 ) +#define hypre_dsyr2k hypre_F90_NAME_BLAS(dsyr2k,DSYR2K) +#define hypre_dsyrk hypre_F90_NAME_BLAS(dsyrk ,DSYRK ) +#define hypre_dtrmm hypre_F90_NAME_BLAS(dtrmm ,DTRMM ) +#define hypre_dtrmv hypre_F90_NAME_BLAS(dtrmv ,DTRMV ) +#define hypre_dtrsm hypre_F90_NAME_BLAS(dtrsm ,DTRSM ) +#define hypre_dtrsv hypre_F90_NAME_BLAS(dtrsv ,DTRSV ) +#define hypre_idamax hypre_F90_NAME_BLAS(idamax,IDAMAX) + +#endif + +/*-------------------------------------------------------------------------- + * Prototypes + *--------------------------------------------------------------------------*/ + +/* dasum.c */ +HYPRE_Real hypre_dasum ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx ); + +/* daxpy.c */ +HYPRE_Int hypre_daxpy ( HYPRE_Int *n , HYPRE_Real *da , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy ); + +/* dcopy.c */ +HYPRE_Int hypre_dcopy ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy ); + +/* ddot.c */ +HYPRE_Real hypre_ddot ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy ); + +/* dgemm.c */ +HYPRE_Int hypre_dgemm ( const char *transa , const char *transb , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c , HYPRE_Int *ldc ); + +/* dgemv.c */ +HYPRE_Int hypre_dgemv ( const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *beta , HYPRE_Real *y , HYPRE_Int *incy ); + +/* dger.c */ +HYPRE_Int hypre_dger ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *y , HYPRE_Int *incy , HYPRE_Real *a , HYPRE_Int *lda ); + +/* dnrm2.c */ +HYPRE_Real hypre_dnrm2 ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx ); + +/* drot.c */ +HYPRE_Int hypre_drot ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy , HYPRE_Real *c , HYPRE_Real *s ); + +/* dscal.c */ +HYPRE_Int hypre_dscal ( HYPRE_Int *n , HYPRE_Real *da , HYPRE_Real *dx , HYPRE_Int *incx ); + +/* dswap.c */ +HYPRE_Int hypre_dswap ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy ); + +/* dsymm.c */ +HYPRE_Int hypre_dsymm ( const char *side , const char *uplo , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c__ , HYPRE_Int *ldc ); + +/* dsymv.c */ +HYPRE_Int hypre_dsymv ( const char *uplo , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *beta , HYPRE_Real *y , HYPRE_Int *incy ); + +/* dsyr2.c */ +HYPRE_Int hypre_dsyr2 ( const char *uplo , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *y , HYPRE_Int *incy , HYPRE_Real *a , HYPRE_Int *lda ); + +/* dsyr2k.c */ +HYPRE_Int hypre_dsyr2k ( const char *uplo , const char *trans , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c__ , HYPRE_Int *ldc ); + +/* dsyrk.c */ +HYPRE_Int hypre_dsyrk ( const char *uplo , const char *trans , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *beta , HYPRE_Real *c , HYPRE_Int *ldc ); + +/* dtrmm.c */ +HYPRE_Int hypre_dtrmm ( const char *side , const char *uplo , const char *transa , const char *diag , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb ); + +/* dtrmv.c */ +HYPRE_Int hypre_dtrmv ( const char *uplo , const char *trans , const char *diag , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx ); + +/* dtrsm.c */ +HYPRE_Int hypre_dtrsm ( const char *side , const char *uplo , const char *transa , const char *diag , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb ); + +/* dtrsv.c */ +HYPRE_Int hypre_dtrsv ( const char *uplo , const char *trans , const char *diag , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx ); + +/* idamax.c */ +HYPRE_Int hypre_idamax ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx ); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru hypre-2.11.2/src/blas/hypre_blas.h hypre-2.13.0/src/blas/hypre_blas.h --- hypre-2.11.2/src/blas/hypre_blas.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/hypre_blas.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,116 +10,45 @@ * $Revision$ ***********************************************************************EHEADER*/ -/* hypre_blas.h -- Contains BLAS prototypes needed by Hypre */ +/***** DO NOT use this file outside of the BLAS directory *****/ -#ifndef HYPRE_BLAS_H -#define HYPRE_BLAS_H -#include "f2c.h" -#include "fortran.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* -------------------------------------------------------------------------- - * Change all names to hypre_ to avoid link conflicts - * --------------------------------------------------------------------------*/ - -#define dasum_ hypre_F90_NAME_BLAS(dasum,DASUM) -#define daxpy_ hypre_F90_NAME_BLAS(daxpy,DAXPY) -#define dcopy_ hypre_F90_NAME_BLAS(dcopy,DCOPY) -#define ddot_ hypre_F90_NAME_BLAS(ddot,DDOT) -#define dgemm_ hypre_F90_NAME_BLAS(dgemm,DGEMM) -#define dgemv_ hypre_F90_NAME_BLAS(dgemv,DGEMV) -#define dger_ hypre_F90_NAME_BLAS(dger,DGER) -#define dnrm2_ hypre_F90_NAME_BLAS(dnrm2,DNRM2) -#define drot_ hypre_F90_NAME_BLAS(drot,DROT) -#define dscal_ hypre_F90_NAME_BLAS(dscal,DSCAL) -#define dswap_ hypre_F90_NAME_BLAS(dswap,DSWAP) -#define dsymm_ hypre_F90_NAME_BLAS(dsymm,DSYMM) -#define dsymv_ hypre_F90_NAME_BLAS(dsymv,DSYMV) -#define dsyr2_ hypre_F90_NAME_BLAS(dsyr2,DSYR2) -#define dsyr2k_ hypre_F90_NAME_BLAS(dsyr2k,DSYR2K) -#define dsyrk_ hypre_F90_NAME_BLAS(dsyrk,DSYRK) -#define dtrmm_ hypre_F90_NAME_BLAS(dtrmm,DTRMM) -#define dtrmv_ hypre_F90_NAME_BLAS(dtrmv,DTRMV) -#define dtrsm_ hypre_F90_NAME_BLAS(dtrsm,DTRSM) -#define dtrsv_ hypre_F90_NAME_BLAS(dtrsv,DTRSV) -#define idamax_ hypre_F90_NAME_BLAS(idamax,IDAMAX) -#define s_cmp hypre_F90_NAME_BLAS(s_cmp,S_CMP) -#define s_copy hypre_F90_NAME_BLAS(s_copy,S_COPY) - -/* blas_utils.c */ -logical lsame_ ( const char *ca ,const char *cb ); -HYPRE_Int xerbla_ ( const char *srname , integer *info ); -integer s_cmp ( char *a0 , const char *b0 , ftnlen la , ftnlen lb ); -VOID s_copy ( char *a , const char *b , ftnlen la , ftnlen lb ); - -/* dasum.c */ -doublereal dasum_ ( integer *n , doublereal *dx , integer *incx ); - -/* daxpy.c */ -HYPRE_Int daxpy_ ( integer *n , doublereal *da , doublereal *dx , integer *incx , doublereal *dy , integer *incy ); - -/* dcopy.c */ -HYPRE_Int dcopy_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy ); - -/* ddot.c */ -doublereal ddot_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy ); - -/* dgemm.c */ -HYPRE_Int dgemm_ ( const char *transa , const char *transb , integer *m , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c , integer *ldc ); - -/* dgemv.c */ -HYPRE_Int dgemv_ ( const char *trans , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *x , integer *incx , doublereal *beta , doublereal *y , integer *incy ); - -/* dger.c */ -HYPRE_Int dger_ ( integer *m , integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *y , integer *incy , doublereal *a , integer *lda ); - -/* dnrm2.c */ -doublereal dnrm2_ ( integer *n , doublereal *dx , integer *incx ); - -/* drot.c */ -HYPRE_Int drot_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy , doublereal *c , doublereal *s ); - -/* dscal.c */ -HYPRE_Int dscal_ ( integer *n , doublereal *da , doublereal *dx , integer *incx ); - -/* dswap.c */ -HYPRE_Int dswap_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy ); - -/* dsymm.c */ -HYPRE_Int dsymm_ ( const char *side , const char *uplo , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c__ , integer *ldc ); - -/* dsymv.c */ -HYPRE_Int dsymv_ ( const char *uplo , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *x , integer *incx , doublereal *beta , doublereal *y , integer *incy ); - -/* dsyr2.c */ -HYPRE_Int dsyr2_ ( const char *uplo , integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *y , integer *incy , doublereal *a , integer *lda ); - -/* dsyr2k.c */ -HYPRE_Int dsyr2k_ ( const char *uplo , const char *trans , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c__ , integer *ldc ); - -/* dsyrk.c */ -HYPRE_Int dsyrk_ ( const char *uplo , const char *trans , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *beta , doublereal *c , integer *ldc ); - -/* dtrmm.c */ -HYPRE_Int dtrmm_ ( const char *side , const char *uplo , const char *transa , const char *diag , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb ); - -/* dtrmv.c */ -HYPRE_Int dtrmv_ ( const char *uplo , const char *trans , const char *diag , integer *n , doublereal *a , integer *lda , doublereal *x , integer *incx ); - -/* dtrsm.c */ -HYPRE_Int dtrsm_ ( const char *side , const char *uplo , const char *transa , const char *diag , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb ); - -/* dtrsv.c */ -HYPRE_Int dtrsv_ ( const char *uplo , const char *trans , const char *diag , integer *n , doublereal *a , integer *lda , doublereal *x , integer *incx ); - -/* idamax.c */ -integer idamax_ ( integer *n , doublereal *dx , integer *incx ); - -#ifdef __cplusplus -} -#endif +/*-------------------------------------------------------------------------- + * This header renames the functions in BLAS to avoid conflicts + *--------------------------------------------------------------------------*/ + +/* blas */ +#define dasum_ hypre_dasum +#define daxpy_ hypre_daxpy +#define dcopy_ hypre_dcopy +#define ddot_ hypre_ddot +#define dgemm_ hypre_dgemm +#define dgemv_ hypre_dgemv +#define dger_ hypre_dger +#define dnrm2_ hypre_dnrm2 +#define drot_ hypre_drot +#define dscal_ hypre_dscal +#define dswap_ hypre_dswap +#define dsymm_ hypre_dsymm +#define dsymv_ hypre_dsymv +#define dsyr2_ hypre_dsyr2 +#define dsyr2k_ hypre_dsyr2k +#define dsyrk_ hypre_dsyrk +#define dtrmm_ hypre_dtrmm +#define dtrmv_ hypre_dtrmv +#define dtrsm_ hypre_dtrsm +#define dtrsv_ hypre_dtrsv +#define idamax_ hypre_idamax + +/* f2c library routines */ +#define s_cmp hypre_s_cmp +#define s_copy hypre_s_copy +#define s_cat hypre_s_cat +#define d_lg10 hypre_d_lg10 +#define d_sign hypre_d_sign +#define pow_dd hypre_pow_dd +#define pow_di hypre_pow_di + +/* these auxiliary routines have a different definition in LAPACK */ +#define lsame_ hypre_blas_lsame +#define xerbla_ hypre_blas_xerbla -#endif diff -Nru hypre-2.11.2/src/blas/idamax.c hypre-2.13.0/src/blas/idamax.c --- hypre-2.11.2/src/blas/idamax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/idamax.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,6 @@ - +#ifdef __cplusplus +extern "C" { +#endif /* -- translated by f2c (version 19940927). You must link the resulting object file with the libraries: @@ -78,3 +80,6 @@ return ret_val; } /* idamax_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/lsame.c hypre-2.13.0/src/blas/lsame.c --- hypre-2.11.2/src/blas/lsame.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/blas/lsame.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,125 @@ +#ifdef __cplusplus +extern "C" { +#endif + +/* lsame.f -- translated by f2c (version 20061008). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" +#include "hypre_blas.h" + +logical lsame_(const char *ca, const char *cb) +{ + /* System generated locals */ + logical ret_val; + + /* Local variables */ + integer inta, intb, zcode; + + +/* -- LAPACK auxiliary routine (version 3.1) -- */ +/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ +/* November 2006 */ + +/* .. Scalar Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ +/* case. */ + +/* Arguments */ +/* ========= */ + +/* CA (input) CHARACTER*1 */ + +/* CB (input) CHARACTER*1 */ +/* CA and CB specify the single characters to be compared. */ + +/* ===================================================================== */ + +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ + +/* Test if the characters are equal */ + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + +/* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + +/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ +/* machines, on which ICHAR returns a value with bit 8 set. */ +/* ICHAR('A') on Prime machines returns 193 which is the same as */ +/* ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + +/* ASCII is assumed - ZCODE is the ASCII code of either lower or */ +/* upper case 'Z'. */ + + if (inta >= 97 && inta <= 122) { + inta += -32; + } + if (intb >= 97 && intb <= 122) { + intb += -32; + } + + } else if (zcode == 233 || zcode == 169) { + +/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ +/* upper case 'Z'. */ + + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || + (inta >= 162 && inta <= 169)) { + inta += 64; + } + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || + (intb >= 162 && intb <= 169)) { + intb += 64; + } + + } else if (zcode == 218 || zcode == 250) { + +/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */ +/* plus 128 of either lower or upper case 'Z'. */ + + if (inta >= 225 && inta <= 250) { + inta += -32; + } + if (intb >= 225 && intb <= 250) { + intb += -32; + } + } + ret_val = inta == intb; + +/* RETURN */ + +/* End of LSAME */ + + return ret_val; +} /* lsame_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/blas/Makefile hypre-2.13.0/src/blas/Makefile --- hypre-2.11.2/src/blas/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -22,28 +22,30 @@ BLAS_HEADERS = f2c.h hypre_blas.h BLAS_FILES = \ -blas_utils.c\ -dasum.c\ -daxpy.c\ -dcopy.c\ -ddot.c\ -dgemm.c\ -dgemv.c\ -dger.c\ -dnrm2.c\ -drot.c\ -dscal.c\ -dswap.c\ -dsymm.c\ -dsymv.c\ -dsyr2.c\ -dsyr2k.c\ -dsyrk.c\ -dtrmm.c\ -dtrmv.c\ -dtrsm.c\ -dtrsv.c\ -idamax.c + dasum.c\ + daxpy.c\ + dcopy.c\ + ddot.c\ + dgemm.c\ + dgemv.c\ + dger.c\ + dnrm2.c\ + drot.c\ + dscal.c\ + dswap.c\ + dsymm.c\ + dsymv.c\ + dsyr2.c\ + dsyr2k.c\ + dsyrk.c\ + dtrmm.c\ + dtrmv.c\ + dtrsm.c\ + dtrsv.c\ + f2c.c\ + idamax.c\ + lsame.c\ + xerbla.c OBJS = ${BLAS_FILES:.c=.o} @@ -54,6 +56,7 @@ all: ${OBJS} install: all + cp -fR $(srcdir)/_hypre_blas.h $(HYPRE_INC_INSTALL) clean: rm -rf *.o diff -Nru hypre-2.11.2/src/blas/README hypre-2.13.0/src/blas/README --- hypre-2.11.2/src/blas/README 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/blas/README 2017-10-20 17:42:22.000000000 +0000 @@ -10,22 +10,29 @@ # $Revision$ #EHEADER********************************************************************** +HYPRE BLAS README file +The source in the HYPRE BLAS and LAPACK is taken from CLAPACK and most recently +based on release 3.2.1 (though many of the files here are much older). To add a +new BLAS or LAPACK routine, copy the file to the appropriate directory, then do +the following: +- Add C include guards at the beginning and end of the file to allow for C++ compilation +- Change the 'blaswrap.h' include file to either 'hypre_blas.h' or 'hypre_lapack.h' +- Change 'int' to 'integer' to avoid errors in the autotest check-int script +- Add 'const' in front of 'char *' in prototypes as required by C++ (use the + warnings from the C++ compiler to determine where the changes are needed) -HYPRE BLAS README file +- Add the #define name changes to 'hypre_blas.h' and/or 'hypre_lapack.h'. + Organize things alphabetically and by routine type. Note that the blas + renaming needs to be replicated in 'hypre_lapack.h'. -This is a subset of the CBLAS from netlib.org. -o The LAPACK/blas auxiliary routine xerbla_ has been - renamed through this directory as hypre_xerbla_, to - reduce name conflicts with the same routine in the - HYPRE lapack directory. -o The LAPACK/blas auxiliary routine lsame_ has been - renamed through this directory as hypre_lsame_, for - the same reason as above. -o The routines xerbla_ and lsame_ have been moved into - the blas_utils.c file. +- Create a hypre_ prototype for the main BLAS and LAPACK routines in either + '_hypre_blas.h' or '_hypre_lapack.h'. Do not create prototypes for auxiliary + routines or f2c-library routines. +- To determine which routines are auxiliary routines, look at the comments in + the C files (search for 'auxiliary'). diff -Nru hypre-2.11.2/src/blas/xerbla.c hypre-2.13.0/src/blas/xerbla.c --- hypre-2.11.2/src/blas/xerbla.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/blas/xerbla.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,86 @@ +#ifdef __cplusplus +extern "C" { +#endif + +/* xerbla.f -- translated by f2c (version 20061008). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" +#include "hypre_blas.h" + +/* Table of constant values */ + +/*static integer c__1 = 1;*/ + +/* Subroutine */ integer xerbla_(const char *srname, integer *info) +{ + /* Format strings */ + /* + static char fmt_9999[] = "(\002 ** On entry to \002,a,\002 parameter num" + "ber \002,i2,\002 had \002,\002an illegal value\002)"; + */ + + /* Builtin functions */ + integer s_wsfe(cilist *), i_len_trim(char *, ftnlen), do_fio(integer *, + char *, ftnlen), e_wsfe(void); + /* Subroutine */ integer s_stop(char *, ftnlen); + + /* Fortran I/O blocks */ + /*static cilist io___1 = { 0, 6, 0, fmt_9999, 0 };*/ + + + +/* -- LAPACK auxiliary routine (preliminary version) -- */ +/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ +/* November 2006 */ + +/* .. Scalar Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* XERBLA is an error handler for the LAPACK routines. */ +/* It is called by an LAPACK routine if an input parameter has an */ +/* invalid value. A message is printed and execution stops. */ + +/* Installers may consider modifying the STOP statement in order to */ +/* call system-specific exception-handling facilities. */ + +/* Arguments */ +/* ========= */ + +/* SRNAME (input) CHARACTER*(*) */ +/* The name of the routine which called XERBLA. */ + +/* INFO (input) INTEGER */ +/* The position of the invalid parameter in the parameter list */ +/* of the calling routine. */ + +/* ===================================================================== */ + +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Executable Statements .. */ + + hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n", + srname, *info); + + +/* End of XERBLA */ + + return 0; +} /* xerbla_ */ + +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/CMakeLists.txt hypre-2.13.0/src/CMakeLists.txt --- hypre-2.11.2/src/CMakeLists.txt 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/CMakeLists.txt 2017-10-20 17:42:22.000000000 +0000 @@ -2,8 +2,8 @@ project (hypre) # The version number. -set (HYPRE_VERSION 2.11.2) -set (HYPRE_DATE 2017/03/13) +set (HYPRE_VERSION 2.13.0) +set (HYPRE_DATE 2017/10/20) set (HYPRE_TIME 00:00:00) set (HYPRE_BUGS hypre-support@llnl.gov) set (HYPRE_SRCDIR "${PROJECT_SOURCE_DIR}") @@ -25,6 +25,8 @@ # Configuration options option(HYPRE_SHARED "Build a shared library" OFF) option(HYPRE_BIGINT "Use long long int for HYPRE_Int" OFF) +option(HYPRE_SINGLE "Use float for HYPRE_Real" OFF) +option(HYPRE_LONG_DOUBLE "Use long double for HYPRE_Real" OFF) option(HYPRE_SEQUENTIAL "Compile without MPI" OFF) option(HYPRE_TIMING "Use HYPRE timing routines" OFF) option(HYPRE_USING_HYPRE_BLAS "Use internal BLAS library" ON) @@ -44,6 +46,14 @@ set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE) endif () +if (HYPRE_SINGLE) + set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE) +endif () + +if (HYPRE_LONG_DOUBLE) + set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE) +endif () + if (HYPRE_SEQUENTIAL) set (HYPRE_NO_GLOBAL_PARTITION OFF CACHE BOOL "" FORCE) endif () @@ -65,9 +75,7 @@ ) # Headers and sources: blas -if (HYPRE_USING_HYPRE_BLAS) list (APPEND HYPRE_SOURCES - blas/blas_utils.c blas/dasum.c blas/daxpy.c blas/dcopy.c @@ -88,12 +96,13 @@ blas/dtrmv.c blas/dtrsm.c blas/dtrsv.c + blas/f2c.c blas/idamax.c + blas/lsame.c + blas/xerbla.c ) -endif (HYPRE_USING_HYPRE_BLAS) # Headers and sources: lapack -if (HYPRE_USING_HYPRE_LAPACK) list (APPEND HYPRE_SOURCES lapack/dbdsqr.c lapack/dgebd2.c @@ -163,11 +172,9 @@ lapack/dsytrd.c lapack/ieeeck.c lapack/ilaenv.c - lapack/lapack_utils.c lapack/lsame.c lapack/xerbla.c ) -endif (HYPRE_USING_HYPRE_LAPACK) # Headers and sources: utilities list (APPEND HYPRE_HEADERS @@ -456,6 +463,7 @@ parcsr_ls/HYPRE_parcsr_lgmres.c parcsr_ls/HYPRE_parcsr_hybrid.c parcsr_ls/HYPRE_parcsr_int.c + parcsr_ls/HYPRE_parcsr_mgr.c parcsr_ls/HYPRE_parcsr_ParaSails.c parcsr_ls/HYPRE_parcsr_pcg.c parcsr_ls/HYPRE_parcsr_pilut.c @@ -470,6 +478,7 @@ parcsr_ls/par_cg_relax_wt.c parcsr_ls/par_coarsen.c parcsr_ls/par_cgc_coarsen.c + parcsr_ls/par_cheby.c parcsr_ls/par_coarse_parms.c parcsr_ls/par_coordinates.c parcsr_ls/par_cr.c @@ -485,12 +494,16 @@ parcsr_ls/par_laplace_9pt.c parcsr_ls/par_laplace.c parcsr_ls/par_lr_interp.c + parcsr_ls/par_mgr.c + parcsr_ls/par_mgr_setup.c + parcsr_ls/par_mgr_solve.c parcsr_ls/par_nongalerkin.c parcsr_ls/par_nodal_systems.c parcsr_ls/par_rap.c parcsr_ls/par_rap_communication.c parcsr_ls/par_rotate_7pt.c parcsr_ls/par_vardifconv.c + parcsr_ls/par_vardifconv_rs.c parcsr_ls/par_relax.c parcsr_ls/par_relax_more.c parcsr_ls/par_relax_interface.c diff -Nru hypre-2.11.2/src/config/config.guess hypre-2.13.0/src/config/config.guess --- hypre-2.11.2/src/config/config.guess 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/config/config.guess 2017-10-20 17:42:22.000000000 +0000 @@ -2,7 +2,7 @@ # Attempt to guess a canonical system name. # Copyright 1992-2016 Free Software Foundation, Inc. -timestamp='2016-10-02' +timestamp='2017-03-02' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -1009,7 +1009,23 @@ or32:Linux:*:* | or1k*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; - padre:Linux:*:*) + ppc64le:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + padre:Linux:*:* echo sparc-unknown-linux-${LIBC} exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) diff -Nru hypre-2.11.2/src/config/configure.in hypre-2.13.0/src/config/configure.in --- hypre-2.11.2/src/config/configure.in 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/config/configure.in 2017-10-20 17:42:22.000000000 +0000 @@ -55,8 +55,8 @@ dnl ********************************************************************* m4_define([M4_HYPRE_NAME], [hypre]) -m4_define([M4_HYPRE_VERSION], [2.11.2]) -m4_define([M4_HYPRE_DATE], [2017/03/13]) +m4_define([M4_HYPRE_VERSION], [2.13.0]) +m4_define([M4_HYPRE_DATE], [2017/10/20]) m4_define([M4_HYPRE_TIME], [00:00:00]) m4_define([M4_HYPRE_BUGS], [hypre-support@llnl.gov]) m4_define([M4_HYPRE_SRCDIR], [`pwd`]) @@ -133,18 +133,26 @@ hypre_user_chose_blas=no hypre_user_chose_lapack=no hypre_user_chose_fei=no +hypre_user_chose_cuda=no +hypre_user_chose_raja=no +hypre_user_chose_kokkos=no hypre_using_c=yes hypre_using_cxx=yes hypre_using_mpi=yes +hypre_using_superlu=no +hypre_using_dsuperlu=no + hypre_using_fei=yes -hypre_using_superlu=yes -hypre_using_mli=yes +hypre_using_mli=no hypre_using_openmp=no hypre_using_insure=no +hypre_using_cuda=no +hypre_using_gpu=no +hypre_using_um=no hypre_using_caliper=no hypre_user_gave_caliper_lib=no @@ -217,6 +225,38 @@ AC_DEFINE(HYPRE_BIGINT, 1) fi +AC_ARG_ENABLE(single, +AS_HELP_STRING([--enable-single], + [Use single precision values (default is NO).]), +[case "${enableval}" in + yes) hypre_using_fei=no + hypre_using_single=yes ;; + no) hypre_using_single=no ;; + *) AC_MSG_ERROR([Bad value ${enableval} for --enable-single]) ;; + esac], +[hypre_using_single=no] +) +if test "$hypre_using_single" = "yes" +then + AC_DEFINE(HYPRE_SINGLE, 1) +fi + +AC_ARG_ENABLE(longdouble, +AS_HELP_STRING([--enable-longdouble], + [Use long double precision values (default is NO).]), +[case "${enableval}" in + yes) hypre_using_fei=no + hypre_using_longdouble=yes ;; + no) hypre_using_longdouble=no ;; + *) AC_MSG_ERROR([Bad value ${enableval} for --enable-longdouble]) ;; + esac], +[hypre_using_longdouble=no] +) +if test "$hypre_using_longdouble" = "yes" +then + AC_DEFINE(HYPRE_LONG_DOUBLE, 1) +fi + AC_ARG_ENABLE(complex, AS_HELP_STRING([--enable-complex], [Use complex values (default is NO).]), @@ -293,8 +333,6 @@ *) hypre_using_global_partition=yes ;; esac] ) -dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set -dnl * Need to change to a new approach that always defines variable to some value AC_ARG_ENABLE(fortran, AS_HELP_STRING([--enable-fortran], @@ -307,6 +345,20 @@ [hypre_using_fortran=yes] ) +AC_ARG_ENABLE(unified-memory, +AS_HELP_STRING([--enable-unified-memory], + [Use unified memory for allocating the memory (default is NO).]), +[case "${enableval}" in + yes) hypre_using_um=yes ;; + no) hypre_using_um=no ;; + *) hypre_using_um=no ;; + esac], +[hypre_using_um=no] +) + +dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set +dnl * Need to change to a new approach that always defines variable to some value + dnl ********************************************************************* dnl * Determine if user provided C compiler or flags dnl ********************************************************************* @@ -493,6 +545,8 @@ AC_DEFINE(HYPRE_SEQUENTIAL,1,[No MPI being used])] ) +dnl ***** MPI + AC_ARG_WITH(MPI-include, AS_HELP_STRING([--with-MPI-include=DIR], [User specifies that mpi.h is in DIR. The options @@ -546,6 +600,8 @@ [MPIFLAGS=""] ) +dnl ***** BLAS + AC_ARG_WITH(blas-lib, AS_HELP_STRING([--with-blas-lib=LIBS], [LIBS is space-separated linkable list (enclosed in quotes) of libraries @@ -588,6 +644,8 @@ hypre_blas_lib_dir_old_style=yes] ) +dnl ***** LAPACK + AC_ARG_WITH(lapack-lib, AS_HELP_STRING([--with-lapack-lib=LIBS], [LIBS is space-separated linkable list (enclosed in quotes) of libraries @@ -725,13 +783,78 @@ AS_HELP_STRING([--with-openmp], [Use OpenMP. This may affect which compiler is chosen.]), [case "${withval}" in - yes) hypre_using_openmp=yes - AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support]) ;; + yes) hypre_using_openmp=yes;; no) hypre_using_openmp=no ;; esac], [hypre_using_openmp=no] ) +dnl ***** SuperLU + +AC_ARG_WITH(superlu, +AS_HELP_STRING([--with-superlu], + [Use external SuperLU library.]), +[case "${withval}" in + no) hypre_using_superlu=no ;; + *) hypre_using_superlu=yes ;; + esac] +) + +AS_IF([test "x$with_superlu" = "xyes"], + [AC_DEFINE(HAVE_SUPERLU, 1, [Have external SuperLU library.])], + []) + +AC_ARG_WITH(superlu-include, +AS_HELP_STRING([--with-superlu-include=DIR], + [Directory where SuperLU is installed.]), +[for superlu_inc_dir in $withval; do + SUPERLU_INCLUDE="-I$superlu_inc_dir $SUPERLU_INCLUDE" + done] +) + +AC_ARG_WITH(superlu-lib, +AS_HELP_STRING([--with-superlu-lib=LIBS], + [LIBS is space-separated linkable list (enclosed in quotes) of libraries + needed for SuperLU. OK to use -L and -l flags in the list]), +[for superlu_lib in $withval; do + SUPERLU_LIBS="$SUPERLU_LIBS $superlu_lib" + done] +) + +dnl ***** DSuperLU + +AC_ARG_WITH(dsuperlu, +AS_HELP_STRING([--with-dsuperlu], + [Use external DSuperLU library.]), +[case "${withval}" in + no) hypre_using_dsuperlu=no ;; + *) hypre_using_dsuperlu=yes ;; + esac] +) + +AS_IF([test "x$with_dsuperlu" = "xyes"], + [AC_DEFINE(HAVE_DSUPERLU, 1, [Have external DSuperLU library.])], + []) + +AC_ARG_WITH(dsuperlu-include, +AS_HELP_STRING([--with-dsuperlu-include=DIR], + [Directory where DSuperLU is installed.]), +[for dsuperlu_inc_dir in $withval; do + DSUPERLU_INCLUDE="-I$dsuperlu_inc_dir $DSUPERLU_INCLUDE" + done] +) + +AC_ARG_WITH(dsuperlu-lib, +AS_HELP_STRING([--with-dsuperlu-lib=LIBS], + [LIBS is space-separated linkable list (enclosed in quotes) of libraries + needed for DSuperLU. OK to use -L and -l flags in the list]), +[for dsuperlu_lib in $withval; do + DSUPERLU_LIBS="$DSUPERLU_LIBS $dsuperlu_lib" + done] +) + +dnl ***** FEI + AC_ARG_WITH(fei, AS_HELP_STRING([--with-fei], [Use internal FEI routines.]), @@ -741,14 +864,7 @@ esac] ) -AC_ARG_WITH(superlu, -AS_HELP_STRING([--with-superlu], - [Use internal SuperLU routines.]), -[case "${withval}" in - no) hypre_using_superlu=no ;; - *) hypre_using_superlu=yes ;; - esac] -) +dnl ***** MLI AC_ARG_WITH(mli, AS_HELP_STRING([--with-mli], @@ -759,6 +875,8 @@ esac] ) +dnl ***** MPI + AC_ARG_WITH(MPI, AS_HELP_STRING([--with-MPI], [DEFAULT: Compile with MPI. Selecting --without-MPI @@ -769,6 +887,147 @@ esac] ) +dnl ***** CUDA + +AC_ARG_WITH(cuda, +AS_HELP_STRING([--with-cuda], + [Use CUDA. Require cuda-8.0 or higher (default is NO).]), +[case "$withval" in + yes) hypre_user_chose_cuda=yes + hypre_using_cuda=yes ;; + no) hypre_using_cuda=no ;; + *) hypre_using_cuda=no ;; + esac], +[hypre_using_cuda=no] +) + +dnl ***** RAJA + +AC_ARG_WITH(raja, +AS_HELP_STRING([--with-raja], + [Use RAJA. Require RAJA package to be compiled properly (default is NO).]), +[case "$withval" in + yes) hypre_user_chose_raja=yes;; + no) hypre_user_chose_raja=no ;; + *) hypre_user_chose_raja=no ;; + esac], +[hypre_using_raja=no] +) + +dnl ***** Kokkos + +AC_ARG_WITH(kokkos, +AS_HELP_STRING([--with-kokkos], + [Use Kokkos. Require kokkos package to be compiled properly(default is NO).]), +[case "$withval" in + yes) hypre_user_chose_kokkos=yes ;; + no) hypre_user_chose_kokkos=no ;; + *) hypre_user_chose_kokkos=no ;; + esac] +) + +AC_ARG_WITH(raja-include, +AS_HELP_STRING([--with-raja-include=DIR], + [User specifies that RAJA/*.h is in DIR. The options + --with-raja-include --with-raja-libs and + --with-raja-lib-dirs must be used together.]), +[for raja_dir in $withval; do + HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir" + done; + hypre_user_chose_raja=yes] +) + +AC_ARG_WITH(raja-lib, +AS_HELP_STRING([--with-raja-lib=LIBS], + [LIBS is space-separated linkable list (enclosed in quotes) of libraries + needed for RAJA. OK to use -L and -l flags in the list]), +[for raja_lib in $withval; do + HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib" + done; +hypre_user_chose_raja=yes] +) + +AC_ARG_WITH(raja-libs, +AS_HELP_STRING([--with-raja-libs=LIBS], + [LIBS is space-separated list (enclosed in quotes) of libraries + needed for RAJA (base name only). The options --with-raja-libs and + --with-raja-lib-dirs must be used together.]), +[for raja_lib in $withval; do + HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib" + done; +hypre_user_chose_raja=yes] +) + +AC_ARG_WITH(raja-lib-dirs, +AS_HELP_STRING([--with-raja-lib-dirs=DIRS], + [DIRS is space-separated list (enclosed in quotes) of + directories containing the libraries specified by + --with-raja-libs, e.g "usr/lib /usr/local/lib". + The options --with-raja-libs and --raja-blas-lib-dirs + must be used together.]), +[for raja_lib_dir in $withval; do + HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR" + done; + hypre_user_chose_raja=yes] +) + +AC_ARG_WITH(kokkos-include, +AS_HELP_STRING([--with-kokkos-include=DIR], + [User specifies that KOKKOS headers is in DIR. The options + --with-kokkos-include --with-kokkos-libs and + --with-kokkos-dirs must be used together.]), +[for kokkos_dir in $withval; do +HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir" +done; +hypre_user_chose_kokkos=yes] +) + +AC_ARG_WITH(kokkos-lib, +AS_HELP_STRING([--with-kokkos-lib=LIBS], + [LIBS is space-separated linkable list (enclosed in quotes) of libraries + needed for KOKKOS. OK to use -L and -l flags in the list]), +[for kokkos_lib in $withval; do + HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib" + done; +hypre_user_chose_kokkos=yes] +) + +AC_ARG_WITH(kokkos-libs, +AS_HELP_STRING([--with-kokkos-libs=LIBS], + [LIBS is space-separated list (enclosed in quotes) of libraries + needed for KOKKOS (base name only). The options --with-kokkos-libs and + --with-kokkos-dirs must be used together.]), +[for kokkos_lib in $withval; do + HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib" + done; +hypre_user_chose_kokkos=yes] +) + +AC_ARG_WITH(kokkos-dirs, +AS_HELP_STRING([--with-kokkos-dirs=DIRS], + [DIRS is space-separated list (enclosed in quotes) of + directories containing Makefile.kokkos. + The options --with-kokkos-libs and --with-kokkos-dirs + must be used together.]), +[for kokkos_lib_dir in $withval; do + HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir" + done; +hypre_user_chose_kokkos=yes] +) + +AC_ARG_WITH(nvcc, +AS_HELP_STRING([--with-nvcc], + [Use NVCC compiler (default is NO).]), +[case "${withval}" in + yes) hypre_using_nvcc=yes ;; + no) hypre_using_nvcc=no ;; + *) AC_MSG_ERROR([Bad value ${withval} for --with-nvcc]) ;; + esac], +[hypre_using_nvcc=no] +) + +dnl ***** Caliper + AC_ARG_WITH(caliper, AS_HELP_STRING([--with-caliper], [Use Caliper instrumentation (default is NO).]), @@ -801,22 +1060,48 @@ dnl ********************************************************************* dnl * Select compilers if not already defined by command line options dnl ********************************************************************* +if test "$hypre_using_cuda" = "yes" +then + hypre_using_fortran=no + AC_CHECK_PROGS(CXX, [nvcc]) + AC_CHECK_PROGS(CC, [nvcc]) + if test "$hypre_user_chose_cxxcompilers" = "no" + then + if test "$hypre_using_mpi" = "no" + then + if test "$hypre_using_openmp" = "yes" + then + AC_CHECK_PROGS(CUDACXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc]) + else + AC_CHECK_PROGS(CUDACXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc]) + fi + else + if test "$hypre_using_openmp" = "yes" + then + AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC]) + else + AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC]) + fi + fi + fi +fi + if test "$hypre_user_chose_ccompilers" = "no" then if test "$hypre_using_mpi" = "no" then if test "$hypre_using_openmp" = "yes" then - AC_CHECK_PROGS(CC, [xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC]) + AC_CHECK_PROGS(CC, [xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC]) else AC_CHECK_PROGS(CC, [xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC]) fi else if test "$hypre_using_openmp" = "yes" then - AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpiicc mpicc mpipgcc]) + AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc]) else - AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpicc mpipgcc]) + AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc]) fi fi @@ -832,16 +1117,16 @@ then if test "$hypre_using_openmp" = "yes" then - AC_CHECK_PROGS(CXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc]) + AC_CHECK_PROGS(CXX, [xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc]) else AC_CHECK_PROGS(CXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc]) fi else if test "$hypre_using_openmp" = "yes" then - AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC]) + AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC]) else - AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC]) + AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC]) fi fi @@ -933,10 +1218,10 @@ dnl * to make that happen. if test "$hypre_using_global_partition" = "no" then - if test "$hypre_using_mpi" != "no" - then +dnl if test "$hypre_using_mpi" != "no" +dnl then AC_DEFINE(HYPRE_NO_GLOBAL_PARTITION, 1) - fi +dnl fi fi dnl ********************************************************************* @@ -981,9 +1266,6 @@ fi if test "$hypre_using_hypre_blas" = "yes" then - HYPRE_BLAS_SRC_DIR="$HYPRE_SRCDIR/blas" - HYPRE_BLAS_FILES="$HYPRE_SRCDIR/blas/*.o" -dnl BLAS_DIR="$HYPRE_SRCDIR/hypre/lib" BLASLIBDIRS="" BLASLIBS="" AC_DEFINE(HYPRE_USING_HYPRE_BLAS, 1, [Using internal HYPRE routines]) @@ -1027,9 +1309,6 @@ fi if test "$hypre_using_hypre_lapack" = "yes" then - HYPRE_LAPACK_SRC_DIR="$HYPRE_SRCDIR/lapack" - HYPRE_LAPACK_FILES="$HYPRE_SRCDIR/lapack/*.o" -dnl LAPACK_DIR="$HYPRE_SRCDIR/hypre/lib" LAPACKLIBDIRS="" LAPACKLIBS="" AC_DEFINE(HYPRE_USING_HYPRE_LAPACK, 1, [Using internal HYPRE routines]) @@ -1037,28 +1316,23 @@ fi dnl ********************************************************************* -dnl * Determine if FEI libraries are needed. -dnl * if so, determine whether internal routines or user-specified -dnl * libraries are to be used. +dnl * Determine if FEI and MLI are needed. +dnl * Note that MLI requires both FEI and SuperLU. dnl ********************************************************************* if test "$hypre_using_fei" = "yes" then HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv" HYPRE_FEI_SUBDIRS="fei-hypre" HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o" - if test "$hypre_using_superlu" = "yes" - then - HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" - HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" - else - HYPRE_FEI_SUPERLU_FILES= - fi + HYPRE_FEI_FEMLI_FILES= if test "$hypre_using_mli" = "yes" then - HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" - HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o" - else - HYPRE_FEI_FEMLI_FILES= + if test "$hypre_using_superlu" = "yes" + then + HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" + HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o" + AC_DEFINE(HAVE_MLI, 1, [Using MLI.]) + fi fi if test "$hypre_user_chose_fei" = "no" then @@ -1070,7 +1344,6 @@ HYPRE_FEI_BASE_DIR= HYPRE_FEI_HYPRE_FILES= HYPRE_FEI_FEMLI_FILES= - HYPRE_FEI_SUPERLU_FILES= fi dnl ********************************************************************* @@ -1254,6 +1527,172 @@ fi dnl ********************************************************************* +dnl * Set nvcc options +dnl ********************************************************************* + +if test "$hypre_using_nvcc" = "yes" +then + AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler]) + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED " + CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED " +else + NVCCFLAGS= + NVCCLIBS= + HYPRE_NVCC_MAKEFILE="Makefile.empty" +fi + +dnl ********************************************************************* +dnl * Set raja options +dnl ********************************************************************* +if test "$hypre_user_chose_raja" = "yes" +then + RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB " + if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"] + then + CFLAGS+=" -+ " + fi + if test "$hypre_using_cuda" = "yes" + then + RAJAFLAGS=" -lRAJA " + LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS " + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register $RAJAFLAGS " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + AC_MSG_NOTICE([*******************************************************]) + AC_MSG_NOTICE([Configuring with --with-raja and --with-cuda without unified memory.]) + AC_MSG_NOTICE([It only works for struct interface.]) + AC_MSG_NOTICE([Try to confiure with --wiht-raja --with-cuda --enable-unified-memory]) + AC_MSG_NOTICE([to use the cuda feature for the whold package]) + AC_MSG_NOTICE([*******************************************************]) + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + fi + hypre_user_chose_cuda=no + else + if test "$hypre_using_openmp" = "yes" + then + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP " + hypre_using_openmp=no + else + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA " + fi + fi +fi +dnl ********************************************************************* +dnl * Set kokkos options +dnl ********************************************************************* + +if test "$hypre_user_chose_kokkos" = "yes" +then + if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"] + then + CFLAGS+=" -+ " + fi + if test "$hypre_using_cuda" = "yes" + then + LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda" + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + CC=${CXX} + LINK_CC=$LINK_CXX + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + AC_MSG_NOTICE([*******************************************************]) + AC_MSG_NOTICE([Configuring with --with-kokkos and --with-cuda, but not with unified memory]) + AC_MSG_NOTICE([It only works for struct interface.]) + AC_MSG_NOTICE([Try to confiure with --wiht-raja --with-cuda --enable-unified-memory]) + AC_MSG_NOTICE([to use the cuda feature for the whold package]) + AC_MSG_NOTICE([*******************************************************]) + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler]) + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi + hypre_user_chose_cuda=no + else + if test "$hypre_using_openmp" = "yes" + then + CC=${CXX} + CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP" + CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP" + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + hypre_using_openmp=no + else + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS " + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler]) + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi + fi + KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB " +fi +dnl ********************************************************************* +dnl * Set cuda options +dnl ********************************************************************* +if test "$hypre_user_chose_cuda" = "yes" +then + LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 " + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU" + CXXFLAGS+=" -DHYPRE_MEMORY_GPU" + AC_MSG_NOTICE([*******************************************************]) + AC_MSG_NOTICE([Configuring with --with-cuda=yes without unified memory.]) + AC_MSG_NOTICE([It only works for struct interface.]) + AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) + AC_MSG_NOTICE([*******************************************************]) + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler]) + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi +fi + +if test "$hypre_using_um" = "yes" +then + LDFLAGS+=" -lcudart -lcuda " + CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include " + CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include " +fi + +if test "$hypre_using_openmp" = "yes" +then + AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support]) +fi + +dnl ********************************************************************* dnl * Set installation directories dnl ********************************************************************* HYPRE_INSTALLDIR="${prefix}" @@ -1340,27 +1779,60 @@ AC_SUBST(HYPRE_FEI_SUBDIRS) AC_SUBST(HYPRE_FEI_HYPRE_FILES) AC_SUBST(HYPRE_FEI_FEMLI_FILES) -AC_SUBST(HYPRE_FEI_SUPERLU_FILES) dnl ********************************************************************* dnl * BLAS & LAPACK related information dnl ********************************************************************* -AC_SUBST(HYPRE_BLAS_SRC_DIR) -AC_SUBST(HYPRE_BLAS_FILES) +AC_SUBST(HYPRE_KOKKOS_PATH) AC_SUBST(BLASLIBDIRS) AC_SUBST(BLASLIBS) -AC_SUBST(HYPRE_LAPACK_SRC_DIR) -AC_SUBST(HYPRE_LAPACK_FILES) AC_SUBST(LAPACKLIBDIRS) AC_SUBST(LAPACKLIBS) dnl ********************************************************************* +dnl * RAJA information +dnl ********************************************************************* +AC_SUBST(HYPRE_RAJA_LIB_DIR) +AC_SUBST(HYPRE_RAJA_INCLUDE) +AC_SUBST(HYPRE_RAJA_LIB) +AC_SUBST(RAJA_LIBS) + +dnl ********************************************************************* +dnl * KOKKOS information +dnl ********************************************************************* +AC_SUBST(HYPRE_KOKKOS_SRC_DIR) +AC_SUBST(HYPRE_KOKKOS_LIB_DIR) +AC_SUBST(HYPRE_KOKKOS_INCLUDE) +AC_SUBST(HYPRE_KOKKOS_INC_FILE) +AC_SUBST(HYPRE_KOKKOS_LIB) +AC_SUBST(KOKKOS_LIBS) + +dnl ********************************************************************* +dnl * NVCC stuff +dnl ********************************************************************* +AC_SUBST(NVCCFLAGS) +AC_SUBST(NVCCLIBS) +AC_SUBST(HYPRE_NVCC_MAKEFILE) + +dnl ********************************************************************* dnl * Caliper instrumentation dnl ********************************************************************* AC_SUBST(CALIPER_INCLUDE) AC_SUBST(CALIPER_LIBS) dnl ********************************************************************* +dnl * SuperLU instrumentation +dnl ********************************************************************* +AC_SUBST(SUPERLU_INCLUDE) +AC_SUBST(SUPERLU_LIBS) + +dnl ********************************************************************* +dnl * DSuperLU instrumentation +dnl ********************************************************************* +AC_SUBST(DSUPERLU_INCLUDE) +AC_SUBST(DSUPERLU_LIBS) + +dnl ********************************************************************* dnl * ar & ranlib substitution dnl ********************************************************************* AC_SUBST(AR) diff -Nru hypre-2.11.2/src/config/HYPRE_config.h.cmake.in hypre-2.13.0/src/config/HYPRE_config.h.cmake.in --- hypre-2.11.2/src/config/HYPRE_config.h.cmake.in 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/config/HYPRE_config.h.cmake.in 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,12 @@ /* Use long long int for HYPRE_Int */ #cmakedefine HYPRE_BIGINT +/* Use single precision values for HYPRE_Real */ +#cmakedefine HYPRE_SINGLE + +/* Use quad precision values for HYPRE_Real */ +#cmakedefine HYPRE_LONG_DOUBLE + /* Use complex values */ #cmakedefine HYPRE_COMPLEX diff -Nru hypre-2.11.2/src/config/HYPRE_config.h.in hypre-2.13.0/src/config/HYPRE_config.h.in --- hypre-2.11.2/src/config/HYPRE_config.h.in 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/config/HYPRE_config.h.in 2017-10-20 17:42:22.000000000 +0000 @@ -49,6 +49,12 @@ /* Define to 1 if using long long int for HYPRE_Int */ #undef HYPRE_BIGINT +/* Define to 1 if using single precision values for HYPRE_Real */ +#undef HYPRE_SINGLE + +/* Define to 1 if using quad precision values for HYPRE_Real */ +#undef HYPRE_LONG_DOUBLE + /* Define to 1 if using complex values */ #undef HYPRE_COMPLEX @@ -115,5 +121,18 @@ /* As HYPRE_FC_FUNC, but for C identifiers containing underscores. */ #undef FC_FUNC_ +/* Define to 1 if nvcc is enabled */ +#undef HYPRE_USING_NVCC + /* Define to 1 if Caliper instrumentation is enabled */ #undef HYPRE_USING_CALIPER + +/* Define to 1 if using SuperLU */ +#undef HAVE_SUPERLU + +/* Define to 1 if using DSuperLU */ +#undef HAVE_DSUPERLU + +/* Define to 1 if using MLI */ +#undef HAVE_MLI + diff -Nru hypre-2.11.2/src/config/Makefile.config.in hypre-2.13.0/src/config/Makefile.config.in --- hypre-2.11.2/src/config/Makefile.config.in 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/config/Makefile.config.in 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ # $Revision$ #EHEADER********************************************************************** - ############################################################################### ## All configurable variables are defined in the file named Makefile.config.in ## When Autoconf is run, it will create a file named Makefile.config which @@ -42,7 +41,7 @@ HYPRE_LIB_SUFFIX = @HYPRE_LIBSUFFIX@ .SUFFIXES: -.SUFFIXES: .o .f .c .C .cxx .cc +.SUFFIXES: .o .f .c .C .cxx .cc .cu .f.o: $(FC) $(FFLAGS) -c $< @@ -54,6 +53,8 @@ $(CXX) $(CXXFLAGS) -c $< .cc.o: $(CXX) $(CXXFLAGS) -c $< +.cu.o: + $(NVCC) $(NVCCFLAGS) -c $< FC = @FC@ FFLAGS = @FFLAGS@ @FCFLAGS@ $(FC_COMPILE_FLAGS) @@ -64,6 +65,10 @@ CXX = @CXX@ CXXFLAGS = @CXXFLAGS@ @DEFS@ $(CXX_COMPILE_FLAGS) +NVCC = nvcc +NVCCFLAGS = @NVCCFLAGS@ +NVCCLIBS = @NVCCLIBS@ + LINK_FC = @LINK_FC@ LINK_CC = @LINK_CC@ LINK_CXX = @LINK_CXX@ @@ -85,26 +90,22 @@ RANLIB = @RANLIB@ LDFLAGS = @LDFLAGS@ -LIBS = @LIBS@ @CALIPER_LIBS@ +LIBS = @LIBS@ @CALIPER_LIBS@ @NVCCLIBS@ @RAJA_LIBS@ @KOKKOS_LIBS@ FLIBS = @FLIBS@ -INCLUDES = @CALIPER_INCLUDE@ +INCLUDES = @CALIPER_INCLUDE@ @HYPRE_RAJA_INCLUDE@ @HYPRE_KOKKOS_INCLUDE@ ################################################################## ## LAPACK Library Flags ################################################################## -HYPRE_LAPACK_SRC_DIR = @HYPRE_LAPACK_SRC_DIR@ -HYPRE_LAPACK_FILES = @HYPRE_LAPACK_FILES@ -LAPACKLIBS = @LAPACKLIBS@ -LAPACKLIBDIRS = @LAPACKLIBDIRS@ +LAPACKLIBS = @LAPACKLIBS@ +LAPACKLIBDIRS = @LAPACKLIBDIRS@ ################################################################## ## BLAS Library Flags ################################################################## -HYPRE_BLAS_SRC_DIR = @HYPRE_BLAS_SRC_DIR@ -HYPRE_BLAS_FILES = @HYPRE_BLAS_FILES@ -BLASLIBS = @BLASLIBS@ -BLASLIBDIRS = @BLASLIBDIRS@ +BLASLIBS = @BLASLIBS@ +BLASLIBDIRS = @BLASLIBDIRS@ ################################################################## ## MPI options @@ -115,17 +116,49 @@ MPIFLAGS = @MPIFLAGS@ ################################################################## +## NVCC options +################################################################## +HYPRE_NVCC_MAKEFILE = @HYPRE_NVCC_MAKEFILE@ + +################################################################## ## Caliper options ################################################################## CALIPER_INCLUDE = @CALIPER_INCLUDE@ CALIPER_LIBS = @CALIPER_LIBS@ ################################################################## +## SuperLU options +################################################################## +SUPERLU_INCLUDE = @SUPERLU_INCLUDE@ +SUPERLU_LIBS = @SUPERLU_LIBS@ + +################################################################## +## DsuperLU options +################################################################## +DSUPERLU_INCLUDE = @DSUPERLU_INCLUDE@ +DSUPERLU_LIBS = @DSUPERLU_LIBS@ + +################################################################## ## FEI options ################################################################## HYPRE_FEI_SRC_DIR = @HYPRE_FEI_SRC_DIR@ HYPRE_FEI_BASE_DIR = @HYPRE_FEI_BASE_DIR@ HYPRE_FEI_SUBDIRS = @HYPRE_FEI_SUBDIRS@ -HYPRE_FEI_SUPERLU_FILES = @HYPRE_FEI_SUPERLU_FILES@ HYPRE_FEI_HYPRE_FILES = @HYPRE_FEI_HYPRE_FILES@ HYPRE_FEI_FEMLI_FILES = @HYPRE_FEI_FEMLI_FILES@ + +################################################################## +## RAJA options +################################################################## +HYPRE_RAJA_LIB_DIR = @HYPRE_RAJA_LIB_DIR@ +HYPRE_RAJA_INCLUDE = @HYPRE_RAJA_INCLUDE@ +HYPRE_RAJA_LIB = @HYPRE_RAJA_LIB@ + +################################################################## +## kokkos options +################################################################## +HYPRE_KOKKOS_SRC_DIR = @HYPRE_KOKKOS_SRC_DIR@ +HYPRE_KOKKOS_LIB_DIR = @HYPRE_KOKKOS_LIB_DIR@ +HYPRE_KOKKOS_INCLUDE = @HYPRE_KOKKOS_INCLUDE@ +HYPRE_KOKKOS_LIB = @HYPRE_KOKKOS_LIB@ +@HYPRE_KOKKOS_INC_FILE@ diff -Nru hypre-2.11.2/src/configure hypre-2.13.0/src/configure --- hypre-2.11.2/src/configure 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/configure 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,7 @@ #! /bin/sh # From configure.in Id. # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for hypre 2.11.2. +# Generated by GNU Autoconf 2.69 for hypre 2.13.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -586,8 +586,8 @@ # Identity of this package. PACKAGE_NAME='hypre' PACKAGE_TARNAME='hypre' -PACKAGE_VERSION='2.11.2' -PACKAGE_STRING='hypre 2.11.2' +PACKAGE_VERSION='2.13.0' +PACKAGE_STRING='hypre 2.13.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -596,17 +596,30 @@ ac_subst_vars='LTLIBOBJS LIBOBJS AR +DSUPERLU_LIBS +DSUPERLU_INCLUDE +SUPERLU_LIBS +SUPERLU_INCLUDE CALIPER_LIBS CALIPER_INCLUDE +HYPRE_NVCC_MAKEFILE +NVCCLIBS +NVCCFLAGS +KOKKOS_LIBS +HYPRE_KOKKOS_LIB +HYPRE_KOKKOS_INC_FILE +HYPRE_KOKKOS_INCLUDE +HYPRE_KOKKOS_LIB_DIR +HYPRE_KOKKOS_SRC_DIR +RAJA_LIBS +HYPRE_RAJA_LIB +HYPRE_RAJA_INCLUDE +HYPRE_RAJA_LIB_DIR LAPACKLIBS LAPACKLIBDIRS -HYPRE_LAPACK_FILES -HYPRE_LAPACK_SRC_DIR BLASLIBS BLASLIBDIRS -HYPRE_BLAS_FILES -HYPRE_BLAS_SRC_DIR -HYPRE_FEI_SUPERLU_FILES +HYPRE_KOKKOS_PATH HYPRE_FEI_FEMLI_FILES HYPRE_FEI_HYPRE_FILES HYPRE_FEI_SUBDIRS @@ -653,6 +666,7 @@ CFLAGS RANLIB SET_MAKE +CUDACXX FC CXX CC @@ -714,6 +728,8 @@ enable_debug enable_shared enable_bigint +enable_single +enable_longdouble enable_complex enable_maxdim enable_persistent @@ -721,6 +737,7 @@ with_no_global_partition enable_global_partition enable_fortran +enable_unified_memory with_LD with_LDFLAGS with_extra_incpath @@ -744,10 +761,27 @@ with_print_errors with_timing with_openmp -with_fei with_superlu +with_superlu_include +with_superlu_lib +with_dsuperlu +with_dsuperlu_include +with_dsuperlu_lib +with_fei with_mli with_MPI +with_cuda +with_raja +with_kokkos +with_raja_include +with_raja_lib +with_raja_libs +with_raja_lib_dirs +with_kokkos_include +with_kokkos_lib +with_kokkos_libs +with_kokkos_dirs +with_nvcc with_caliper with_caliper_include with_caliper_lib @@ -1308,7 +1342,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures hypre 2.11.2 to adapt to many kinds of systems. +\`configure' configures hypre 2.13.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1373,7 +1407,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of hypre 2.11.2:";; + short | recursive ) echo "Configuration of hypre 2.13.0:";; esac cat <<\_ACEOF @@ -1384,6 +1418,8 @@ --enable-debug Set compiler flags for debugging. --enable-shared Build shared libraries (default is NO). --enable-bigint Use long long int for HYPRE_Int (default is NO). + --enable-single Use single precision values (default is NO). + --enable-longdouble Use long double precision values (default is NO). --enable-complex Use complex values (default is NO). --enable-maxdim=MAXDIM Change max dimension size to MAXDIM (default is 3). Currently must be at least 3. @@ -1393,6 +1429,8 @@ --enable-global-partition Use global partitioning (default is NO). --enable-fortran Require a working Fortran compiler (default is YES). + --enable-unified-memory Use unified memory for allocating the memory + (default is NO). Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] @@ -1476,11 +1514,61 @@ --with-timing Use HYPRE timing routines. --with-openmp Use OpenMP. This may affect which compiler is chosen. + --with-superlu Use external SuperLU library. + --with-superlu-include=DIR + Directory where SuperLU is installed. + --with-superlu-lib=LIBS LIBS is space-separated linkable list (enclosed in + quotes) of libraries needed for SuperLU. OK to use + -L and -l flags in the list + --with-dsuperlu Use external DSuperLU library. + --with-dsuperlu-include=DIR + Directory where DSuperLU is installed. + --with-dsuperlu-lib=LIBS + LIBS is space-separated linkable list (enclosed in + quotes) of libraries needed for DSuperLU. OK to use + -L and -l flags in the list --with-fei Use internal FEI routines. - --with-superlu Use internal SuperLU routines. --with-mli Use MLI --with-MPI DEFAULT: Compile with MPI. Selecting --without-MPI may affect which compiler is chosen. + --with-cuda Use CUDA. Require cuda-8.0 or higher (default is + NO). + --with-raja Use RAJA. Require RAJA package to be compiled + properly (default is NO). + --with-kokkos Use Kokkos. Require kokkos package to be compiled + properly(default is NO). + --with-raja-include=DIR User specifies that RAJA/*.h is in DIR. The options + --with-raja-include --with-raja-libs and + --with-raja-lib-dirs must be used together. + --with-raja-lib=LIBS LIBS is space-separated linkable list (enclosed in + quotes) of libraries needed for RAJA. OK to use -L + and -l flags in the list + --with-raja-libs=LIBS LIBS is space-separated list (enclosed in quotes) of + libraries needed for RAJA (base name only). The + options --with-raja-libs and --with-raja-lib-dirs + must be used together. + --with-raja-lib-dirs=DIRS + DIRS is space-separated list (enclosed in quotes) of + directories containing the libraries specified by + --with-raja-libs, e.g "usr/lib /usr/local/lib". The + options --with-raja-libs and --raja-blas-lib-dirs + must be used together. + --with-kokkos-include=DIR + User specifies that KOKKOS headers is in DIR. The + options --with-kokkos-include --with-kokkos-libs and + --with-kokkos-dirs must be used together. + --with-kokkos-lib=LIBS LIBS is space-separated linkable list (enclosed in + quotes) of libraries needed for KOKKOS. OK to use -L + and -l flags in the list + --with-kokkos-libs=LIBS LIBS is space-separated list (enclosed in quotes) of + libraries needed for KOKKOS (base name only). The + options --with-kokkos-libs and --with-kokkos-dirs + must be used together. + --with-kokkos-dirs=DIRS DIRS is space-separated list (enclosed in quotes) of + directories containing Makefile.kokkos. The options + --with-kokkos-libs and --with-kokkos-dirs must be + used together. + --with-nvcc Use NVCC compiler (default is NO). --with-caliper Use Caliper instrumentation (default is NO). --with-caliper-include=DIR Directory where Caliper is installed. @@ -1570,7 +1658,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -hypre configure 2.11.2 +hypre configure 2.13.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1867,7 +1955,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by hypre $as_me 2.11.2, which was +It was created by hypre $as_me 2.13.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2253,8 +2341,8 @@ HYPRE_NAME="hypre" -HYPRE_VERSION="2.11.2" -HYPRE_DATE="2017/03/13" +HYPRE_VERSION="2.13.0" +HYPRE_DATE="2017/10/20" HYPRE_TIME="00:00:00" HYPRE_BUGS="hypre-support@llnl.gov" HYPRE_SRCDIR="`pwd`" @@ -2306,18 +2394,26 @@ hypre_user_chose_blas=no hypre_user_chose_lapack=no hypre_user_chose_fei=no +hypre_user_chose_cuda=no +hypre_user_chose_raja=no +hypre_user_chose_kokkos=no hypre_using_c=yes hypre_using_cxx=yes hypre_using_mpi=yes +hypre_using_superlu=no +hypre_using_dsuperlu=no + hypre_using_fei=yes -hypre_using_superlu=yes -hypre_using_mli=yes +hypre_using_mli=no hypre_using_openmp=no hypre_using_insure=no +hypre_using_cuda=no +hypre_using_gpu=no +hypre_using_um=no hypre_using_caliper=no hypre_user_gave_caliper_lib=no @@ -2457,6 +2553,44 @@ fi +# Check whether --enable-single was given. +if test "${enable_single+set}" = set; then : + enableval=$enable_single; case "${enableval}" in + yes) hypre_using_fei=no + hypre_using_single=yes ;; + no) hypre_using_single=no ;; + *) as_fn_error $? "Bad value ${enableval} for --enable-single" "$LINENO" 5 ;; + esac +else + hypre_using_single=no + +fi + +if test "$hypre_using_single" = "yes" +then + $as_echo "#define HYPRE_SINGLE 1" >>confdefs.h + +fi + +# Check whether --enable-longdouble was given. +if test "${enable_longdouble+set}" = set; then : + enableval=$enable_longdouble; case "${enableval}" in + yes) hypre_using_fei=no + hypre_using_longdouble=yes ;; + no) hypre_using_longdouble=no ;; + *) as_fn_error $? "Bad value ${enableval} for --enable-longdouble" "$LINENO" 5 ;; + esac +else + hypre_using_longdouble=no + +fi + +if test "$hypre_using_longdouble" = "yes" +then + $as_echo "#define HYPRE_LONG_DOUBLE 1" >>confdefs.h + +fi + # Check whether --enable-complex was given. if test "${enable_complex+set}" = set; then : enableval=$enable_complex; case "${enableval}" in @@ -2562,6 +2696,20 @@ fi +# Check whether --enable-unified-memory was given. +if test "${enable_unified_memory+set}" = set; then : + enableval=$enable_unified_memory; case "${enableval}" in + yes) hypre_using_um=yes ;; + no) hypre_using_um=no ;; + *) hypre_using_um=no ;; + esac +else + hypre_using_um=no + +fi + + + if test "x$CC" = "x" then hypre_user_chose_ccompilers=no @@ -2862,6 +3010,7 @@ + # Check whether --with-MPI-include was given. if test "${with_MPI_include+set}" = set; then : withval=$with_MPI_include; for mpi_dir in $withval; do @@ -2911,6 +3060,7 @@ + # Check whether --with-blas-lib was given. if test "${with_blas_lib+set}" = set; then : withval=$with_blas_lib; for blas_lib in $withval; do @@ -2946,6 +3096,7 @@ + # Check whether --with-lapack-lib was given. if test "${with_lapack_lib+set}" = set; then : withval=$with_lapack_lib; for lapack_lib in $withval; do @@ -3092,10 +3243,7 @@ # Check whether --with-openmp was given. if test "${with_openmp+set}" = set; then : withval=$with_openmp; case "${withval}" in - yes) hypre_using_openmp=yes - -$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h - ;; + yes) hypre_using_openmp=yes;; no) hypre_using_openmp=no ;; esac else @@ -3105,16 +3253,6 @@ -# Check whether --with-fei was given. -if test "${with_fei+set}" = set; then : - withval=$with_fei; case "${withval}" in - no) hypre_using_fei=no ;; - *) hypre_using_fei=yes ;; - esac - -fi - - # Check whether --with-superlu was given. if test "${with_superlu+set}" = set; then : @@ -3126,6 +3264,83 @@ fi +if test "x$with_superlu" = "xyes"; then : + +$as_echo "#define HAVE_SUPERLU 1" >>confdefs.h + +fi + + +# Check whether --with-superlu-include was given. +if test "${with_superlu_include+set}" = set; then : + withval=$with_superlu_include; for superlu_inc_dir in $withval; do + SUPERLU_INCLUDE="-I$superlu_inc_dir $SUPERLU_INCLUDE" + done + +fi + + + +# Check whether --with-superlu-lib was given. +if test "${with_superlu_lib+set}" = set; then : + withval=$with_superlu_lib; for superlu_lib in $withval; do + SUPERLU_LIBS="$SUPERLU_LIBS $superlu_lib" + done + +fi + + + + +# Check whether --with-dsuperlu was given. +if test "${with_dsuperlu+set}" = set; then : + withval=$with_dsuperlu; case "${withval}" in + no) hypre_using_dsuperlu=no ;; + *) hypre_using_dsuperlu=yes ;; + esac + +fi + + +if test "x$with_dsuperlu" = "xyes"; then : + +$as_echo "#define HAVE_DSUPERLU 1" >>confdefs.h + +fi + + +# Check whether --with-dsuperlu-include was given. +if test "${with_dsuperlu_include+set}" = set; then : + withval=$with_dsuperlu_include; for dsuperlu_inc_dir in $withval; do + DSUPERLU_INCLUDE="-I$dsuperlu_inc_dir $DSUPERLU_INCLUDE" + done + +fi + + + +# Check whether --with-dsuperlu-lib was given. +if test "${with_dsuperlu_lib+set}" = set; then : + withval=$with_dsuperlu_lib; for dsuperlu_lib in $withval; do + DSUPERLU_LIBS="$DSUPERLU_LIBS $dsuperlu_lib" + done + +fi + + + + +# Check whether --with-fei was given. +if test "${with_fei+set}" = set; then : + withval=$with_fei; case "${withval}" in + no) hypre_using_fei=no ;; + *) hypre_using_fei=yes ;; + esac + +fi + + + # Check whether --with-mli was given. if test "${with_mli+set}" = set; then : @@ -3138,6 +3353,7 @@ + # Check whether --with-MPI was given. if test "${with_MPI+set}" = set; then : withval=$with_MPI; case "$withval" in @@ -3149,6 +3365,153 @@ + +# Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; case "$withval" in + yes) hypre_user_chose_cuda=yes + hypre_using_cuda=yes ;; + no) hypre_using_cuda=no ;; + *) hypre_using_cuda=no ;; + esac +else + hypre_using_cuda=no + +fi + + + + +# Check whether --with-raja was given. +if test "${with_raja+set}" = set; then : + withval=$with_raja; case "$withval" in + yes) hypre_user_chose_raja=yes;; + no) hypre_user_chose_raja=no ;; + *) hypre_user_chose_raja=no ;; + esac +else + hypre_using_raja=no + +fi + + + + +# Check whether --with-kokkos was given. +if test "${with_kokkos+set}" = set; then : + withval=$with_kokkos; case "$withval" in + yes) hypre_user_chose_kokkos=yes ;; + no) hypre_user_chose_kokkos=no ;; + *) hypre_user_chose_kokkos=no ;; + esac + +fi + + + +# Check whether --with-raja-include was given. +if test "${with_raja_include+set}" = set; then : + withval=$with_raja_include; for raja_dir in $withval; do + HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir" + done; + hypre_user_chose_raja=yes + +fi + + + +# Check whether --with-raja-lib was given. +if test "${with_raja_lib+set}" = set; then : + withval=$with_raja_lib; for raja_lib in $withval; do + HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib" + done; +hypre_user_chose_raja=yes + +fi + + + +# Check whether --with-raja-libs was given. +if test "${with_raja_libs+set}" = set; then : + withval=$with_raja_libs; for raja_lib in $withval; do + HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib" + done; +hypre_user_chose_raja=yes + +fi + + + +# Check whether --with-raja-lib-dirs was given. +if test "${with_raja_lib_dirs+set}" = set; then : + withval=$with_raja_lib_dirs; for raja_lib_dir in $withval; do + HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR" + done; + hypre_user_chose_raja=yes + +fi + + + +# Check whether --with-kokkos-include was given. +if test "${with_kokkos_include+set}" = set; then : + withval=$with_kokkos_include; for kokkos_dir in $withval; do +HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir" +done; +hypre_user_chose_kokkos=yes + +fi + + + +# Check whether --with-kokkos-lib was given. +if test "${with_kokkos_lib+set}" = set; then : + withval=$with_kokkos_lib; for kokkos_lib in $withval; do + HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib" + done; +hypre_user_chose_kokkos=yes + +fi + + + +# Check whether --with-kokkos-libs was given. +if test "${with_kokkos_libs+set}" = set; then : + withval=$with_kokkos_libs; for kokkos_lib in $withval; do + HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib" + done; +hypre_user_chose_kokkos=yes + +fi + + + +# Check whether --with-kokkos-dirs was given. +if test "${with_kokkos_dirs+set}" = set; then : + withval=$with_kokkos_dirs; for kokkos_lib_dir in $withval; do + HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir" + done; +hypre_user_chose_kokkos=yes + +fi + + + +# Check whether --with-nvcc was given. +if test "${with_nvcc+set}" = set; then : + withval=$with_nvcc; case "${withval}" in + yes) hypre_using_nvcc=yes ;; + no) hypre_using_nvcc=no ;; + *) as_fn_error $? "Bad value ${withval} for --with-nvcc" "$LINENO" 5 ;; + esac +else + hypre_using_nvcc=no + +fi + + + + # Check whether --with-caliper was given. if test "${with_caliper+set}" = set; then : withval=$with_caliper; hypre_using_caliper=yes @@ -3185,13 +3548,285 @@ fi +if test "$hypre_using_cuda" = "yes" +then + hypre_using_fortran=no + for ac_prog in nvcc +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CXX"; then + ac_cv_prog_CXX="$CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CXX=$ac_cv_prog_CXX +if test -n "$CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +$as_echo "$CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CXX" && break +done + + for ac_prog in nvcc +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break +done + + if test "$hypre_user_chose_cxxcompilers" = "no" + then + if test "$hypre_using_mpi" = "no" + then + if test "$hypre_using_openmp" = "yes" + then + for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUDACXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUDACXX"; then + ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUDACXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUDACXX=$ac_cv_prog_CUDACXX +if test -n "$CUDACXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5 +$as_echo "$CUDACXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUDACXX" && break +done + + else + for ac_prog in xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUDACXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUDACXX"; then + ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUDACXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUDACXX=$ac_cv_prog_CUDACXX +if test -n "$CUDACXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5 +$as_echo "$CUDACXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUDACXX" && break +done + + fi + else + if test "$hypre_using_openmp" = "yes" + then + for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUDACXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUDACXX"; then + ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUDACXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUDACXX=$ac_cv_prog_CUDACXX +if test -n "$CUDACXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5 +$as_echo "$CUDACXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUDACXX" && break +done + + else + for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUDACXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUDACXX"; then + ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUDACXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUDACXX=$ac_cv_prog_CUDACXX +if test -n "$CUDACXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5 +$as_echo "$CUDACXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUDACXX" && break +done + + fi + fi + fi +fi + if test "$hypre_user_chose_ccompilers" = "no" then if test "$hypre_using_mpi" = "no" then if test "$hypre_using_openmp" = "yes" then - for ac_prog in xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC + for ac_prog in xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -3280,7 +3915,7 @@ else if test "$hypre_using_openmp" = "yes" then - for ac_prog in mpxlc mpixlc_r mpiicc mpicc mpipgcc + for ac_prog in mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -3323,7 +3958,7 @@ done else - for ac_prog in mpxlc mpixlc mpiicc mpicc mpipgcc + for ac_prog in mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -3380,7 +4015,7 @@ then if test "$hypre_using_openmp" = "yes" then - for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc + for ac_prog in xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -3469,7 +4104,7 @@ else if test "$hypre_using_openmp" = "yes" then - for ac_prog in mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC + for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -3512,7 +4147,7 @@ done else - for ac_prog in mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC + for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 @@ -6307,11 +6942,8 @@ if test "$hypre_using_global_partition" = "no" then - if test "$hypre_using_mpi" != "no" - then $as_echo "#define HYPRE_NO_GLOBAL_PARTITION 1" >>confdefs.h - fi fi if test "$hypre_user_chose_blas" = "yes" @@ -6521,8 +7153,6 @@ fi if test "$hypre_using_hypre_blas" = "yes" then - HYPRE_BLAS_SRC_DIR="$HYPRE_SRCDIR/blas" - HYPRE_BLAS_FILES="$HYPRE_SRCDIR/blas/*.o" BLASLIBDIRS="" BLASLIBS="" @@ -6716,8 +7346,6 @@ fi if test "$hypre_using_hypre_lapack" = "yes" then - HYPRE_LAPACK_SRC_DIR="$HYPRE_SRCDIR/lapack" - HYPRE_LAPACK_FILES="$HYPRE_SRCDIR/lapack/*.o" LAPACKLIBDIRS="" LAPACKLIBS="" @@ -6731,19 +7359,17 @@ HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv" HYPRE_FEI_SUBDIRS="fei-hypre" HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o" - if test "$hypre_using_superlu" = "yes" - then - HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS" - HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o" - else - HYPRE_FEI_SUPERLU_FILES= - fi + HYPRE_FEI_FEMLI_FILES= if test "$hypre_using_mli" = "yes" then - HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" - HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o" - else - HYPRE_FEI_FEMLI_FILES= + if test "$hypre_using_superlu" = "yes" + then + HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS" + HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o" + +$as_echo "#define HAVE_MLI 1" >>confdefs.h + + fi fi if test "$hypre_user_chose_fei" = "no" then @@ -6802,7 +7428,6 @@ HYPRE_FEI_BASE_DIR= HYPRE_FEI_HYPRE_FILES= HYPRE_FEI_FEMLI_FILES= - HYPRE_FEI_SUPERLU_FILES= fi if test "$hypre_using_debug" = "yes" @@ -7232,6 +7857,187 @@ fi fi + +if test "$hypre_using_nvcc" = "yes" +then + +$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h + + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED " + CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED " +else + NVCCFLAGS= + NVCCLIBS= + HYPRE_NVCC_MAKEFILE="Makefile.empty" +fi + +if test "$hypre_user_chose_raja" = "yes" +then + RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB " + if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r" + then + CFLAGS+=" -+ " + fi + if test "$hypre_using_cuda" = "yes" + then + RAJAFLAGS=" -lRAJA " + LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS " + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register $RAJAFLAGS " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-raja and --with-cuda without unified memory." >&5 +$as_echo "$as_me: Configuring with --with-raja and --with-cuda without unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5 +$as_echo "$as_me: It only works for struct interface." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&5 +$as_echo "$as_me: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whold package" >&5 +$as_echo "$as_me: to use the cuda feature for the whold package" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + fi + hypre_user_chose_cuda=no + else + if test "$hypre_using_openmp" = "yes" + then + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP " + hypre_using_openmp=no + else + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA " + fi + fi +fi + +if test "$hypre_user_chose_kokkos" = "yes" +then + if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r" + then + CFLAGS+=" -+ " + fi + if test "$hypre_using_cuda" = "yes" + then + LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda" + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + CC=${CXX} + LINK_CC=$LINK_CXX + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 " + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&5 +$as_echo "$as_me: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5 +$as_echo "$as_me: It only works for struct interface." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&5 +$as_echo "$as_me: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whold package" >&5 +$as_echo "$as_me: to use the cuda feature for the whold package" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + +$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h + + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi + hypre_user_chose_cuda=no + else + if test "$hypre_using_openmp" = "yes" + then + CC=${CXX} + CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP" + CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP" + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + hypre_using_openmp=no + else + CC=${CXX} + CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS " + CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS " + HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos" + HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib" + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + +$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h + + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi + fi + KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB " +fi +if test "$hypre_user_chose_cuda" = "yes" +then + LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 " + CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA " + CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA " + if test "$hypre_using_um" != "yes" + then + CFLAGS+=" -DHYPRE_MEMORY_GPU" + CXXFLAGS+=" -DHYPRE_MEMORY_GPU" + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-cuda=yes without unified memory." >&5 +$as_echo "$as_me: Configuring with --with-cuda=yes without unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5 +$as_echo "$as_me: It only works for struct interface." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Use --enable-unified-memory to compile with unified memory." >&5 +$as_echo "$as_me: Use --enable-unified-memory to compile with unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5 +$as_echo "$as_me: *******************************************************" >&6;} + else + CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " + LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + +$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h + + NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include" + NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt" + HYPRE_NVCC_MAKEFILE="Makefile.nvcc" + fi +fi + +if test "$hypre_using_um" = "yes" +then + LDFLAGS+=" -lcudart -lcuda " + CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include " + CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include " +fi + +if test "$hypre_using_openmp" = "yes" +then + +$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h + +fi + HYPRE_INSTALLDIR="${prefix}" HYPRE_LIBINSTALL="${libdir}" HYPRE_INCINSTALL="${includedir}" @@ -7410,6 +8216,24 @@ + + + + + + + + + + + + + + + + + + ac_config_files="$ac_config_files config/Makefile.config" @@ -7919,7 +8743,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by hypre $as_me 2.11.2, which was +This file was extended by hypre $as_me 2.13.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7981,7 +8805,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -hypre config.status 2.11.2 +hypre config.status 2.13.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/blas_dh.c hypre-2.13.0/src/distributed_ls/Euclid/blas_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/blas_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/blas_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -119,7 +119,7 @@ } if (np_dh > 1) { - hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh); + hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh); } else { result = local_result; } @@ -145,7 +145,7 @@ } if (np_dh > 1) { - hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh); + hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh); } else { result = local_result; } diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Euclid_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Euclid_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/Euclid_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/Euclid_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -75,7 +75,7 @@ strcpy(ctx->krylovMethod, "bicgstab"); ctx->maxIts = 200; ctx->rtol = 1e-5; - ctx->atol = 1e-50; + ctx->atol = _ATOL_; ctx->its = 0; ctx->itsTotal = 0; ctx->setupCount = 0; @@ -132,11 +132,15 @@ void Euclid_dhSetup(Euclid_dh ctx) { START_FUNC_DH - HYPRE_Int m, n, beg_row; + HYPRE_Int m, n, beg_row, ierr; HYPRE_Real t1; bool isSetup = ctx->isSetup; bool bj = false; + /* clear error flag if previously setup - DOK */ + if(isSetup) + ierr = HYPRE_GetError(); HYPRE_ClearAllErrors(); + /*---------------------------------------------------- * If Euclid was previously setup, print summary of * what happened during previous setup/solve @@ -169,7 +173,9 @@ if (ctx->A == NULL) { SET_V_ERROR("must set ctx->A before calling init"); } + EuclidGetDimensions(ctx->A, &beg_row, &m, &n); CHECK_V_ERROR; + ctx->m = m; ctx->n = n; @@ -293,6 +299,9 @@ ctx->isSetup = true; + /* setup done. Reset error flag - DOK*/ + hypre_error_flag |= ierr; + END_FUNC_DH } @@ -410,7 +419,7 @@ bufGlobal[1] = bufLocal[1]; bufGlobal[2] = bufLocal[2]; } else { - hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_DOUBLE, hypre_MPI_SUM, 0, comm_dh); + hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_REAL, hypre_MPI_SUM, 0, comm_dh); } if (myid_dh == 0) { @@ -888,7 +897,7 @@ HYPRE_Real bufOUT[TIMING_BINS]; memcpy(bufOUT, ctx->timing, TIMING_BINS*sizeof(HYPRE_Real)); - hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh); + hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh); } ctx->timingsWereReduced = true; diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/ExternalRows_dh.c hypre-2.13.0/src/distributed_ls/Euclid/ExternalRows_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/ExternalRows_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/ExternalRows_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -317,7 +317,7 @@ nz = rcv_nz_counts[i]; hypre_MPI_Irecv(extRowCval+offset, nz, HYPRE_MPI_INT, nabor, CVAL_TAG, comm_dh, er->req1+i); hypre_MPI_Irecv(extRowFill+offset, nz, HYPRE_MPI_INT, nabor, FILL_TAG, comm_dh, er->req2+i); - hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->req3+i); + hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->req3+i); offset += nz; } @@ -537,7 +537,7 @@ HYPRE_Int nabor = hiNabors[i]; hypre_MPI_Isend(cvalSend, nz, HYPRE_MPI_INT, nabor, CVAL_TAG, comm_dh, er->cval_req+i); hypre_MPI_Isend(fillSend, nz, HYPRE_MPI_INT, nabor, FILL_TAG, comm_dh, er->fill_req+i); - hypre_MPI_Isend(avalSend, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->aval_req+i); + hypre_MPI_Isend(avalSend, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->aval_req+i); } END_FUNC_DH } diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Factor_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Factor_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/Factor_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/Factor_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -379,7 +379,7 @@ hypre_MPI_Request_free(&request); /* set up persistent comms for receiving the values from this_pe */ - hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_DOUBLE, this_pe, 555, + hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_REAL, this_pe, 555, comm_dh, req+num_recv); ++num_recv; } @@ -457,7 +457,7 @@ ++count; /* Set up the send */ - hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh, sendReq); + hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_REAL, i, 555, comm_dh, sendReq); } } @@ -1128,7 +1128,7 @@ if (np_dh == 1) { minGlobal = min; } else { - hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, 0, comm_dh); + hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_REAL, hypre_MPI_MIN, 0, comm_dh); } if (minGlobal == 0) { @@ -1155,7 +1155,7 @@ if (np_dh == 1) { maxGlobal = max; } else { - hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh); + hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh); } END_FUNC_VAL(maxGlobal) } @@ -1185,7 +1185,7 @@ if (np_dh == 1) { maxGlobal = max; } else { - hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh); + hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh); } END_FUNC_VAL(maxGlobal) } diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/globalObjects.c hypre-2.13.0/src/distributed_ls/Euclid/globalObjects.c --- hypre-2.11.2/src/distributed_ls/Euclid/globalObjects.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/globalObjects.c 2017-10-20 17:42:22.000000000 +0000 @@ -30,7 +30,10 @@ TimeLog_dh tlog_dh = NULL; /* internal timing functionality */ Mem_dh mem_dh = NULL; /* memory management */ FILE *logFile = NULL; +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA) +#else char msgBuf_dh[MSG_BUF_SIZE_DH]; /* for internal use */ +#endif HYPRE_Int np_dh = 1; /* number of processors and subdomains */ HYPRE_Int myid_dh = 0; /* rank of this processor (and subdomain) */ MPI_Comm comm_dh = 0; diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/_hypre_Euclid.h hypre-2.13.0/src/distributed_ls/Euclid/_hypre_Euclid.h --- hypre-2.11.2/src/distributed_ls/Euclid/_hypre_Euclid.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/_hypre_Euclid.h 2017-10-20 17:42:22.000000000 +0000 @@ -198,8 +198,14 @@ #define FABS(a) ((a) < 0 ? -(a) : a) #endif -/* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */ +#ifdef HYPRE_SINGLE +#define _ATOL_ 1.0e-16 /* used to compute absolute tolerance for Euclid's internal Krylov solvers */ +#define _MATLAB_ZERO_ 1e-30 /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */ +#else // default +#define _ATOL_ 1.0e-50 #define _MATLAB_ZERO_ 1e-100 +#endif + /*---------------------------------------------------------------------- @@ -481,7 +487,11 @@ #endif #define MSG_BUF_SIZE_DH MAX(1024, hypre_MPI_MAX_ERROR_STRING) +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA) +static char msgBuf_dh[MSG_BUF_SIZE_DH]; +#else extern char msgBuf_dh[MSG_BUF_SIZE_DH]; +#endif /* Each processor (may) open a logfile. * The bools are switches for controlling the amount of informational diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/macros_dh.h hypre-2.13.0/src/distributed_ls/Euclid/macros_dh.h --- hypre-2.11.2/src/distributed_ls/Euclid/macros_dh.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/macros_dh.h 2017-10-20 17:42:22.000000000 +0000 @@ -34,7 +34,11 @@ #endif /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */ +#ifdef HYPRE_SINGLE +#define _MATLAB_ZERO_ 1e-30 +#else // default #define _MATLAB_ZERO_ 1e-100 +#endif /*---------------------------------------------------------------------- diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Mat_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Mat_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/Mat_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/Mat_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -251,7 +251,7 @@ /* Count of number of number of indices needed from this_pe */ outlist[this_pe] = j-i; - ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_DOUBLE, this_pe, 555, + ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_REAL, this_pe, 555, comm_dh, &mat->recv_req[mat->num_recv]); CHECK_MPI_V_ERROR(ierr); mat->num_recv++; @@ -289,7 +289,7 @@ ierr = hypre_MPI_Irecv(&mat->sendind[j], inlist[i], HYPRE_MPI_INT, i, 444, comm_dh, &requests[mat->num_send]); CHECK_MPI_V_ERROR(ierr); /* Set up the send */ - ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh, + ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm_dh, &mat->send_req[mat->num_send]); CHECK_MPI_V_ERROR(ierr); mat->num_send++; @@ -601,8 +601,8 @@ if (mat->time[MATVEC_MPI_TIME]) { mat->time[MATVEC_RATIO] = mat->time[MATVEC_TIME] / mat->time[MATVEC_MPI_TIME]; } - hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh); - hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh); + hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh); + hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh); END_FUNC_DH } diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/mat_dh_private.c hypre-2.13.0/src/distributed_ls/Euclid/mat_dh_private.c --- hypre-2.11.2/src/distributed_ls/Euclid/mat_dh_private.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/mat_dh_private.c 2017-10-20 17:42:22.000000000 +0000 @@ -1082,7 +1082,7 @@ } hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i); - hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1); + hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1); } } @@ -1105,7 +1105,7 @@ } hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i); - hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1); + hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1); } } @@ -1195,7 +1195,7 @@ } hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i); - hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1); + hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1); } } @@ -1218,7 +1218,7 @@ } hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i); - hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1); + hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1); } } diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Numbering_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Numbering_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/Numbering_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/Numbering_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -111,8 +111,9 @@ space. The global_to_local hash table may also need to be enlarged, but the hash object will take care of that. */ + /* RL : why ``m+num_ext'' instead of ``num_ext+1'' ??? */ if (m+num_ext >= size) { - HYPRE_Int newSize = size*1.5; /* heuristic */ + HYPRE_Int newSize = hypre_max(m+num_ext+1, size*1.5); /* heuristic */ HYPRE_Int *tmp = (HYPRE_Int*)MALLOC_DH(newSize*sizeof(HYPRE_Int)); CHECK_V_ERROR; memcpy(tmp, idx_ext, size*sizeof(size)); FREE_DH(idx_ext); CHECK_V_ERROR; diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/TimeLog_dh.c hypre-2.13.0/src/distributed_ls/Euclid/TimeLog_dh.c --- hypre-2.11.2/src/distributed_ls/Euclid/TimeLog_dh.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/Euclid/TimeLog_dh.c 2017-10-20 17:42:22.000000000 +0000 @@ -121,8 +121,8 @@ hypre_sprintf(t->desc[t->last], "========== totals, and reset ==========\n"); t->last += 1; - hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh); - hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh); + hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh); + hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh); wasSummed = true; } diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/ConjGrad.c hypre-2.13.0/src/distributed_ls/ParaSails/ConjGrad.c --- hypre-2.11.2/src/distributed_ls/ParaSails/ConjGrad.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/ConjGrad.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,9 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - /****************************************************************************** * * ConjGrad - Preconditioned conjugate gradient algorithm using the @@ -24,20 +21,16 @@ #include "Common.h" #include "Matrix.h" #include "ParaSails.h" - -HYPRE_Real hypre_F90_NAME_BLAS(ddot, DDOT)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(dcopy, DCOPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(dscal, DSCAL)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(daxpy, DAXPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); +#include "_hypre_blas.h" static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm comm) { HYPRE_Real local_result, result; HYPRE_Int one = 1; - local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one); + local_result = hypre_ddot(&n, x, &one, y, &one); - hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm); + hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm); return result; } diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/DiagScale.c hypre-2.13.0/src/distributed_ls/ParaSails/DiagScale.c --- hypre-2.11.2/src/distributed_ls/ParaSails/DiagScale.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/DiagScale.c 2017-10-20 17:42:22.000000000 +0000 @@ -76,7 +76,7 @@ } /* Post receive for diagonal values */ - hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_DOUBLE, this_pe, DIAG_VALS_TAG, + hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_REAL, this_pe, DIAG_VALS_TAG, comm, &requests[*num_requests]); /* Request rows in reqind[i..j-1] */ @@ -130,7 +130,7 @@ sendbuf[j] = local_diags[recvbuf[j] - mat->beg_row]; /* Use ready-mode send, since receives already posted */ - hypre_MPI_Irsend(sendbuf, count, hypre_MPI_DOUBLE, source, + hypre_MPI_Irsend(sendbuf, count, hypre_MPI_REAL, source, DIAG_VALS_TAG, comm, &requests[i]); } } diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/driver.c hypre-2.13.0/src/distributed_ls/ParaSails/driver.c --- hypre-2.11.2/src/distributed_ls/ParaSails/driver.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/driver.c 2017-10-20 17:42:22.000000000 +0000 @@ -139,11 +139,11 @@ #endif } - hypre_MPI_Bcast(&threshg, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD); - hypre_MPI_Bcast(&thresh, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD); + hypre_MPI_Bcast(&threshg, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD); + hypre_MPI_Bcast(&thresh, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD); hypre_MPI_Bcast(&nlevels, 1, HYPRE_MPI_INT, 0, hypre_MPI_COMM_WORLD); - hypre_MPI_Bcast(&filter, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD); - hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD); + hypre_MPI_Bcast(&filter, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD); + hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD); if (nlevels < 0) break; @@ -220,9 +220,9 @@ time1 = hypre_MPI_Wtime(); solve_time = time1-time0; - hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, + hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, hypre_MPI_COMM_WORLD); - hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, + hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, hypre_MPI_COMM_WORLD); if (mype == 0) diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/FGmres.c hypre-2.13.0/src/distributed_ls/ParaSails/FGmres.c --- hypre-2.11.2/src/distributed_ls/ParaSails/FGmres.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/FGmres.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,9 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - /****************************************************************************** * * FlexGmres - Preconditioned flexible GMRES algorithm using the @@ -24,20 +21,16 @@ #include "Common.h" #include "Matrix.h" #include "ParaSails.h" - -HYPRE_Real hypre_F90_NAME_BLAS(ddot, DDOT)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(dcopy, DCOPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(dscal, DSCAL)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(daxpy, DAXPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *); +#include "_hypre_blas.h" static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm comm) { HYPRE_Real local_result, result; HYPRE_Int one = 1; - local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one); + local_result = hypre_ddot(&n, x, &one, y, &one); - hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm); + hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm); return result; } @@ -45,19 +38,19 @@ static void CopyVector(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y) { HYPRE_Int one = 1; - hypre_F90_NAME_BLAS(dcopy, DCOPY)(&n, x, &one, y, &one); + hypre_dcopy(&n, x, &one, y, &one); } static void ScaleVector(HYPRE_Int n, HYPRE_Real alpha, HYPRE_Real *x) { HYPRE_Int one = 1; - hypre_F90_NAME_BLAS(dscal, DSCAL)(&n, &alpha, x, &one); + hypre_dscal(&n, &alpha, x, &one); } static void Axpy(HYPRE_Int n, HYPRE_Real alpha, HYPRE_Real *x, HYPRE_Real *y) { HYPRE_Int one = 1; - hypre_F90_NAME_BLAS(daxpy, DAXPY)(&n, &alpha, x, &one, y, &one); + hypre_daxpy(&n, &alpha, x, &one, y, &one); } /* simulate 2-D arrays at the cost of some arithmetic */ diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/lapdriver.c hypre-2.13.0/src/distributed_ls/ParaSails/lapdriver.c --- hypre-2.11.2/src/distributed_ls/ParaSails/lapdriver.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/lapdriver.c 2017-10-20 17:42:22.000000000 +0000 @@ -304,9 +304,9 @@ ParaSailsStatsPattern(ps, A); ParaSailsStatsValues(ps, A); - hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, + hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, hypre_MPI_COMM_WORLD); - hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, + hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, hypre_MPI_COMM_WORLD); if (mype == 0) diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/LoadBal.c hypre-2.13.0/src/distributed_ls/ParaSails/LoadBal.c --- hypre-2.11.2/src/distributed_ls/ParaSails/LoadBal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/LoadBal.c 2017-10-20 17:42:22.000000000 +0000 @@ -57,7 +57,7 @@ cost = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real)); - hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_DOUBLE, cost, 1, hypre_MPI_DOUBLE, comm); + hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_REAL, cost, 1, hypre_MPI_REAL, comm); /* Compute the average cost */ average = 0.0; @@ -284,7 +284,7 @@ bufferp += len; } - hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_DOUBLE, recip_data[i].pe, + hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_REAL, recip_data[i].pe, LOADBAL_REP_TAG, comm, &request[i]); MatrixDestroy(mat); @@ -311,10 +311,10 @@ { hypre_MPI_Probe(hypre_MPI_ANY_SOURCE, LOADBAL_REP_TAG, comm, &status); source = status.hypre_MPI_SOURCE; - hypre_MPI_Get_count(&status, hypre_MPI_DOUBLE, &count); + hypre_MPI_Get_count(&status, hypre_MPI_REAL, &count); buffer = (HYPRE_Real *) malloc(count * sizeof(HYPRE_Real)); - hypre_MPI_Recv(buffer, count, hypre_MPI_DOUBLE, source, LOADBAL_REP_TAG, + hypre_MPI_Recv(buffer, count, hypre_MPI_REAL, source, LOADBAL_REP_TAG, comm, &status); /* search for which entry in donor_data this message corresponds to */ diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/Makefile hypre-2.13.0/src/distributed_ls/ParaSails/Makefile --- hypre-2.11.2/src/distributed_ls/ParaSails/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,8 @@ -I../..\ -I$(srcdir)\ -I$(srcdir)/../..\ + -I$(srcdir)/../../blas\ + -I$(srcdir)/../../lapack\ -I$(srcdir)/../../utilities\ -I$(srcdir)/../../distributed_matrix\ ${CINCLUDES} diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/Matrix.c hypre-2.13.0/src/distributed_ls/ParaSails/Matrix.c --- hypre-2.11.2/src/distributed_ls/ParaSails/Matrix.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/Matrix.c 2017-10-20 17:42:22.000000000 +0000 @@ -532,7 +532,7 @@ if (mype != 0) { - hypre_MPI_Recv(rhs, num_local, hypre_MPI_DOUBLE, 0, 0, mat->comm, &status); + hypre_MPI_Recv(rhs, num_local, hypre_MPI_REAL, 0, 0, mat->comm, &status); return; } @@ -567,7 +567,7 @@ else hypre_fscanf(file, "%lf", &buffer[i]); - hypre_MPI_Send(buffer, num_local, hypre_MPI_DOUBLE, pe, 0, mat->comm); + hypre_MPI_Send(buffer, num_local, hypre_MPI_REAL, pe, 0, mat->comm); } free(buffer); @@ -614,10 +614,10 @@ /* Count of number of number of indices needed from this_pe */ outlist[this_pe] = j-i; - hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 555, + hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 555, comm, &mat->recv_req[mat->num_recv]); - hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 666, + hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 666, comm, &mat->send_req2[mat->num_recv]); mat->num_recv++; @@ -665,11 +665,11 @@ &requests[mat->num_send]); /* Set up the send */ - hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm, + hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm, &mat->send_req[mat->num_send]); /* Set up the receive for the transpose */ - hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 666, comm, + hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 666, comm, &mat->recv_req2[mat->num_send]); mat->num_send++; diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/ParaSails.c hypre-2.13.0/src/distributed_ls/ParaSails/ParaSails.c --- hypre-2.11.2/src/distributed_ls/ParaSails/ParaSails.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/ParaSails/ParaSails.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,9 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - /****************************************************************************** * * ParaSails - Parallel sparse approximate inverse least squares. @@ -33,6 +30,8 @@ #include "LoadBal.h" #include "ParaSails.h" +#include "_hypre_lapack.h" + #define ROW_PRUNED_REQ_TAG 221 #define ROW_STORED_REQ_TAG 222 #define ROW_REPI_TAG 223 @@ -40,20 +39,6 @@ #ifdef ESSL #include -#else -#ifdef __cplusplus -extern "C" -{ -#endif -HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, - HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dgels, DGELS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, - HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -#ifdef __cplusplus -} -#endif - #endif #if 0 /* no longer need this since using 'memset' now */ @@ -371,7 +356,7 @@ hypre_MPI_Request_free(request); - hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_DOUBLE, dest, ROW_REPV_TAG, + hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_REAL, dest, ROW_REPV_TAG, comm, request); } @@ -400,7 +385,7 @@ ind = StoredRowsAllocInd(stored_rows, count); hypre_MPI_Recv(ind, count, HYPRE_MPI_INT, source, ROW_REPI_TAG, comm, &status); val = StoredRowsAllocVal(stored_rows, count); - hypre_MPI_Recv(val, count, hypre_MPI_DOUBLE, source, ROW_REPV_TAG, comm, &status); + hypre_MPI_Recv(val, count, hypre_MPI_REAL, source, ROW_REPV_TAG, comm, &status); /* Parse the message */ num_rows = *ind++; /* number of rows */ @@ -1149,7 +1134,7 @@ dpps(ahat, len, val, 1); #else /* Solve local linear system - factor phase */ - hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(&uplo, &len, ahat, &len, &info); + hypre_dpotrf(&uplo, &len, ahat, &len, &info); if (info != 0) { #if 0 @@ -1163,8 +1148,7 @@ } /* Solve local linear system - solve phase */ - hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &len, &one, ahat, &len, val, &len, - &info); + hypre_dpotrs(&uplo, &len, &one, ahat, &len, val, &len, &info); if (info != 0) { #if 0 @@ -1339,7 +1323,7 @@ &info, work, work_size); #else /* rhs in bhat, and put solution in bhat */ - hypre_F90_NAME_LAPACK(dgels, DGELS)(&trans, &npat, &len, &one, ahat, &npat, + hypre_dgels(&trans, &npat, &len, &one, ahat, &npat, bhat, &npat, work, &work_size, &info); if (info != 0) @@ -1430,7 +1414,7 @@ } /* Find the average across all processors */ - hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm); + hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm); hypre_MPI_Comm_size(comm, &npes); free(buffer); @@ -1486,7 +1470,7 @@ } /* Find the average across all processors */ - hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm); + hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm); hypre_MPI_Comm_size(comm, &npes); free(buffer); @@ -1980,9 +1964,9 @@ } hypre_MPI_Allreduce(&ps->setup_pattern_time, &max_pattern_time, - 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm); - hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm); - hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm); + 1, hypre_MPI_REAL, hypre_MPI_MAX, comm); + hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_REAL, hypre_MPI_MAX, comm); + hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm); ave_cost = ave_cost / (HYPRE_Real) npes; if (mype) @@ -2029,13 +2013,13 @@ } hypre_MPI_Allreduce(&ps->setup_values_time, &max_values_time, - 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm); + 1, hypre_MPI_REAL, hypre_MPI_MAX, comm); if (!mype) setup_times = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real)); temp = ps->setup_pattern_time + ps->setup_values_time; - hypre_MPI_Gather(&temp, 1, hypre_MPI_DOUBLE, setup_times, 1, hypre_MPI_DOUBLE, 0, comm); + hypre_MPI_Gather(&temp, 1, hypre_MPI_REAL, setup_times, 1, hypre_MPI_REAL, 0, comm); if (mype) return; diff -Nru hypre-2.11.2/src/distributed_ls/pilut/comm.c hypre-2.13.0/src/distributed_ls/pilut/comm.c --- hypre-2.11.2/src/distributed_ls/pilut/comm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/pilut/comm.c 2017-10-20 17:42:22.000000000 +0000 @@ -77,7 +77,7 @@ HYPRE_Real hypre_GlobalSEMaxDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context) { HYPRE_Real max; - hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, hypre_MPI_Context ); + hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_REAL, hypre_MPI_MAX, hypre_MPI_Context ); return max; } @@ -88,7 +88,7 @@ HYPRE_Real hypre_GlobalSEMinDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context) { HYPRE_Real min; - hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, hypre_MPI_Context ); + hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_REAL, hypre_MPI_MIN, hypre_MPI_Context ); return min; } @@ -99,7 +99,7 @@ HYPRE_Real hypre_GlobalSESumDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context) { HYPRE_Real sum; - hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, hypre_MPI_Context ); + hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, hypre_MPI_Context ); return sum; } diff -Nru hypre-2.11.2/src/distributed_ls/pilut/macros.h hypre-2.13.0/src/distributed_ls/pilut/macros.h --- hypre-2.11.2/src/distributed_ls/pilut/macros.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/pilut/macros.h 2017-10-20 17:42:22.000000000 +0000 @@ -83,13 +83,13 @@ #else #ifdef MACHINE_IS_SOLARIS #ifdef USE_SHORT -#define SNRM2 hypre_F90_NAME_BLAS(snrm2, SNRM2) -#define SDOT hypre_F90_NAME_BLAS(sdot, SDOT) -#define SCOPY hypre_F90_NAME_BLAS(scopy, SCOPY) +#define SNRM2 hypre_snrm2 +#define SDOT hypre_sdot +#define SCOPY hypre_scopy #else -#define SNRM2 hypre_F90_NAME_BLAS(dnrm2, DNRM2) -#define SDOT hypre_F90_NAME_BLAS(ddot, DDOT) -#define SCOPY hypre_F90_NAME_BLAS(dcopy, DCOPY) +#define SNRM2 hypre_dnrm2 +#define SDOT hypre_ddot +#define SCOPY hypre_dcopy #endif #else #ifdef USE_SHORT diff -Nru hypre-2.11.2/src/distributed_ls/pilut/parilut.c hypre-2.13.0/src/distributed_ls/pilut/parilut.c --- hypre-2.11.2/src/distributed_ls/pilut/parilut.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/distributed_ls/pilut/parilut.c 2017-10-20 17:42:22.000000000 +0000 @@ -503,7 +503,7 @@ hypre_MPI_Irecv( incolind+j, cnt, HYPRE_MPI_INT, penum, TAG_Send_colind, pilut_comm, &index_requests[i] ); - hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_DOUBLE, + hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_REAL, penum, TAG_Send_values, pilut_comm, &value_requests[i] ); j += cnt; @@ -550,7 +550,7 @@ /* send values to each neighbor */ for (i=0; i 0 ) { /* Something to recv */ - hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE, + hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL, rpes[i], TAG, pilut_comm, &receive_requests[i] ); rdone[i] += rnum[i] ; @@ -139,7 +139,7 @@ for (j=auxsptr[i], l=0; j 0 ) { /* Something to recv */ - hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE, + hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL, rpes[i], TAG, pilut_comm, &receive_requests[ i ] ); rdone[i] += rnum[i] ; @@ -225,7 +225,7 @@ for (j=auxsptr[i], l=0; j=nnodes[ii-1]; j++, l++) gatherbuf[l] = ux[sindex[j]]; - hypre_MPI_Send( gatherbuf, l, hypre_MPI_DOUBLE, + hypre_MPI_Send( gatherbuf, l, hypre_MPI_REAL, spes[i], TAG, pilut_comm ); auxsptr[i] = j; diff -Nru hypre-2.11.2/src/docs/usr_solvers.tex hypre-2.13.0/src/docs/usr_solvers.tex --- hypre-2.11.2/src/docs/usr_solvers.tex 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/docs/usr_solvers.tex 2017-10-20 17:42:22.000000000 +0000 @@ -36,6 +36,7 @@ AMS & & X & X & X \\ ADS & & X & X & X \\ MLI & & X & X & X \\ +MGR & & & & X \\ ParaSails & & X & X & X \\ Euclid & & X & X & X \\ PILUT & & X & X & X \\ @@ -476,10 +477,10 @@ e.g. block, Schwarz, ILU smoothers, it is necessary to use \code{HYPRE_BoomerAMGSetSmoothType} and \code{HYPRE_BoomerAMGSetSmoothNumLevels}. There are further parameter choices for the individual smoothers, which are described in the reference manual. -The default relaxation type is hybrid Gauss-Seidel with CF-relaxation (relax first the C-, -then the F-points) on the down cycle and FC-relaxation on the up-cycle. Note that if +The default relaxation type is l1-Gauss-Seidel, using a forward solve +on the down cycle and a backward solve on the up-cycle, to keep symmetry. Note that if BoomerAMG is used as a preconditioner for conjugate gradient, it is necessary to use -a symmetric smoother such as weighted Jacobi or hybrid symmetric Gauss-Seidel. +a symmetric smoother. Other symmetric options are weighted Jacobi or hybrid symmetric Gauss-Seidel. \subsection{AMG for systems of PDEs} @@ -1190,6 +1191,49 @@ %----------------------------------------------------------------------------- +\section{Multigrid Reduction (MGR)} + +MGR is a parallel multigrid reduction solver and preconditioner designed to take advantage +of use-provided information to solve systems of equations with multiple vatiable types. +The algorithm is similar to two-stage preconditioner strategies and other reduction +techniques like ARMS, but in a standard multigrid framework. + +The MGR algorithm accepts information about the variables in block form from the +user and uses it to define the appropriate C/F splitting for the multigrid scheme. +The linear system solve proceeds with an F-relaxation solve on the F points, +folowed by a coarse grid correction. The coarse grid solve is handled by scalar +AMG (BoomerAMG). MGR provides users with more control over the coarsening process, +and can potentially be a starting point for designing multigrid-based physics-based preconditioners. + +The following represents a minimal set of functions, and some optional functions, to call to +use the MGR solver. For simplicity, we ignore the function parameters here, and refer the +reader to the reference manual for more details on the parameters and their defaults. + +\begin{itemize} +\item {{\tt HYPRE\_MGRCreate:} Create the MGR solver object.} +\item {{\tt HYPRE\_MGRSetCpointsByBlock:} Set up block data with information about coarse indexes +for reduction. Here, the user specifies the number of reduction levels, as well as the the coarse nodes +for each level of the reduction. These coarse nodes are indexed by their index in the block of unknowns. +This is used internally to tag the appropriate indexes of the linear system matrix as coarse nodes.} +\item {(Optional) {\tt HYPRE\_MGRSetReservedCoarseNodes:} Prescribe a subset of nodes to be kept as coarse +nodes until the coarsest level. These nodes are transferred onto the coarsest grid of the BoomerAMG +coarse grid solver.} +\item {(Optional) {\tt HYPRE\_MGRSetNonCpointsToFpoints:} Set points not prescribed as C points to be fixed +as F points for intermediate levels. Setting this to 1 uses the user input to define the C/F splitting. +Otherwise, a BoomerAMG coarsening routine is used to determine the C/F splitting for intermediate levels.} +\item {(Optional) {\tt HYPRE\_MGRSetCoarseSolver:} This function sets the BoomerAMG solver to be used for the +solve on the coarse grid. The user can define their own BoomerAMG solver with their preferred options and +pass this to the MGR solver. Otherwise, an internal BoomerAMG solver is used as the coarse grid solver +instead.} +\item {{\tt HYPRE\_MGRSetup:} Setup and MGR solver object.} +\item {{\tt HYPRE\_MGRSolve:} Solve the linear system.} +\item {{\tt HYPRE\_MGRDestroy:} Destroy the MGR solver object} +\end{itemize} + +For more details about additional solver options and parameters, please refer to the reference manual. +NOTE: The MGR solver is currently only supported by the IJ interface. +%----------------------------------------------------------------------------- + \section{ParaSails} ParaSails is a parallel implementation of a sparse approximate inverse diff -Nru hypre-2.11.2/src/examples/ex12f.f hypre-2.13.0/src/examples/ex12f.f --- hypre-2.11.2/src/examples/ex12f.f 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/examples/ex12f.f 2017-10-20 17:42:22.000000000 +0000 @@ -405,7 +405,7 @@ call HYPRE_StructPFMGCreate(MPI_COMM_WORLD, precond, ierr) c Set PFMG parameters call HYPRE_StructPFMGSetMaxIter(precond, 1, ierr) - call HYPRE_StructPFMGSetTol(precond, 0.0, ierr) + call HYPRE_StructPFMGSetTol(precond, 0.0d0, ierr) call HYPRE_StructPFMGSetZeroGuess(precond, ierr) call HYPRE_StructPFMGSetNumPreRelax(precond, 2, ierr) call HYPRE_StructPFMGSetNumPostRelax(precond, 2, ierr) diff -Nru hypre-2.11.2/src/examples/README_files/ex12f.f.html hypre-2.13.0/src/examples/README_files/ex12f.f.html --- hypre-2.11.2/src/examples/README_files/ex12f.f.html 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/examples/README_files/ex12f.f.html 2017-10-20 17:42:22.000000000 +0000 @@ -412,7 +412,7 @@ 405 call HYPRE_StructPFMGCreate(MPI_COMM_WORLD, precond, ierr) 406 c Set PFMG parameters 407 call HYPRE_StructPFMGSetMaxIter(precond, 1, ierr) -408 call HYPRE_StructPFMGSetTol(precond, 0.0, ierr) +408 call HYPRE_StructPFMGSetTol(precond, 0.0d0, ierr) 409 call HYPRE_StructPFMGSetZeroGuess(precond, ierr) 410 call HYPRE_StructPFMGSetNumPreRelax(precond, 2, ierr) 411 call HYPRE_StructPFMGSetNumPostRelax(precond, 2, ierr) diff -Nru hypre-2.11.2/src/FEI_mv/CMakeLists.txt hypre-2.13.0/src/FEI_mv/CMakeLists.txt --- hypre-2.11.2/src/FEI_mv/CMakeLists.txt 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/CMakeLists.txt 2017-10-20 17:42:22.000000000 +0000 @@ -5,25 +5,25 @@ include_directories(fei-hypre) include_directories(ml/src/include) -option(HYPRE_USING_SUPERLU "Use internal SuperLU routines" ON) -option(HYPRE_USING_MLI "Use MLI" ON) - -if(HYPRE_USING_SUPERLU) - include_directories(SuperLU/SRC) - add_definitions(-DHAVE_SUPERLU) - add_subdirectory(SuperLU) - set(FEI_LIBS ${FEI_LIBS} $) -endif() - -if(HYPRE_USING_MLI) - include_directories(femli) - add_definitions(-DHAVE_MLI) - if(HYPRE_USING_SUPERLU) - add_definitions(-DMLI_SUPERLU) - endif() - add_subdirectory(femli) - set(FEI_LIBS ${FEI_LIBS} $) -endif() +# option(HYPRE_USING_SUPERLU "Use internal SuperLU routines" ON) +# option(HYPRE_USING_MLI "Use MLI" ON) +# +# if(HYPRE_USING_SUPERLU) +# include_directories(SuperLU/SRC) +# add_definitions(-DHAVE_SUPERLU) +# add_subdirectory(SuperLU) +# set(FEI_LIBS ${FEI_LIBS} $) +# endif() +# +# if(HYPRE_USING_MLI) +# include_directories(femli) +# add_definitions(-DHAVE_MLI) +# if(HYPRE_USING_SUPERLU) +# add_definitions(-DMLI_SUPERLU) +# endif() +# add_subdirectory(femli) +# set(FEI_LIBS ${FEI_LIBS} $) +# endif() add_subdirectory(fei-hypre) set(FEI_LIBS ${FEI_LIBS} $) diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/caxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/caxpy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/caxpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/caxpy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int caxpy_(integer *n, complex *ca, complex *cx, integer * - incx, complex *cy, integer *incy) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1, q__2; - - /* Builtin functions */ - double r_imag(complex *); - - /* Local variables */ - static integer i, ix, iy; - - -/* constant times a vector plus a vector. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define CY(I) cy[(I)-1] -#define CX(I) cx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if ((r__1 = ca->r, dabs(r__1)) + (r__2 = r_imag(ca), dabs(r__2)) == 0.f) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = iy; - i__4 = ix; - q__2.r = ca->r * CX(ix).r - ca->i * CX(ix).i, q__2.i = ca->r * CX( - ix).i + ca->i * CX(ix).r; - q__1.r = CY(iy).r + q__2.r, q__1.i = CY(iy).i + q__2.i; - CY(iy).r = q__1.r, CY(iy).i = q__1.i; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - i__4 = i; - q__2.r = ca->r * CX(i).r - ca->i * CX(i).i, q__2.i = ca->r * CX( - i).i + ca->i * CX(i).r; - q__1.r = CY(i).r + q__2.r, q__1.i = CY(i).i + q__2.i; - CY(i).r = q__1.r, CY(i).i = q__1.i; -/* L30: */ - } - return 0; -} /* caxpy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ccopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ccopy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ccopy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ccopy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,74 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int ccopy_(integer *n, complex *cx, integer *incx, complex * - cy, integer *incy) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - - /* Local variables */ - static integer i, ix, iy; - - -/* copies a vector, x, to a vector, y. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define CY(I) cy[(I)-1] -#define CX(I) cx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = ix; - CY(iy).r = CX(ix).r, CY(iy).i = CX(ix).i; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - CY(i).r = CX(i).r, CY(i).i = CX(i).i; -/* L30: */ - } - return 0; -} /* ccopy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cdotc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cdotc.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cdotc.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cdotc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Complex */ VOID cdotc_(complex * ret_val, integer *n, complex *cx, integer - *incx, complex *cy, integer *incy) -{ - /* System generated locals */ - integer i__1, i__2; - complex q__1, q__2, q__3; - - /* Builtin functions */ - void r_cnjg(complex *, complex *); - - /* Local variables */ - static integer i; - static complex ctemp; - static integer ix, iy; - - -/* forms the dot product of two vectors, conjugating the first - vector. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments */ - --cy; - --cx; - - /* Function Body */ - ctemp.r = 0.f, ctemp.i = 0.f; - ret_val->r = 0.f, ret_val->i = 0.f; - if (*n <= 0) { - return ; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - r_cnjg(&q__3, &cx[ix]); - i__2 = iy; - q__2.r = q__3.r * cy[iy].r - q__3.i * cy[iy].i, q__2.i = q__3.r * - cy[iy].i + q__3.i * cy[iy].r; - q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i; - ctemp.r = q__1.r, ctemp.i = q__1.i; - ix += *incx; - iy += *incy; -/* L10: */ - } - ret_val->r = ctemp.r, ret_val->i = ctemp.i; - return ; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - r_cnjg(&q__3, &cx[i]); - i__2 = i; - q__2.r = q__3.r * cy[i].r - q__3.i * cy[i].i, q__2.i = q__3.r * - cy[i].i + q__3.i * cy[i].r; - q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i; - ctemp.r = q__1.r, ctemp.i = q__1.i; -/* L30: */ - } - ret_val->r = ctemp.r, ret_val->i = ctemp.i; - return ; -} /* cdotc_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,399 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int cgemv_(char *trans, integer *m, integer *n, complex * - alpha, complex *a, integer *lda, complex *x, integer *incx, complex * - beta, complex *y, integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2, q__3; - - /* Builtin functions */ - void r_cnjg(complex *, complex *); - - /* Local variables */ - static integer info; - static complex temp; - static integer lenx, leny, i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical noconj; - - -/* Purpose - ======= - - CGEMV performs one of the matrix-vector operations - - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or - - y := alpha*conjg( A' )*x + beta*y, - - where alpha and beta are scalars, x and y are vectors and A is an - m by n matrix. - - Parameters - ========== - - TRANS - CHARACTER*1. - On entry, TRANS specifies the operation to be performed as - follows: - - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - - TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - X - COMPLEX array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - COMPLEX . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - COMPLEX array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - - updated vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! - lsame_(trans, "C")) { - info = 1; - } else if (*m < 0) { - info = 2; - } else if (*n < 0) { - info = 3; - } else if (*lda < max(1,*m)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } else if (*incy == 0) { - info = 11; - } - if (info != 0) { - xerbla_("CGEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r - == 1.f && beta->i == 0.f)) { - return 0; - } - - noconj = lsame_(trans, "T"); - -/* Set LENX and LENY, the lengths of the vectors x and y, and set - - up the start points in X and Y. */ - - if (lsame_(trans, "N")) { - lenx = *n; - leny = *m; - } else { - lenx = *m; - leny = *n; - } - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (lenx - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (leny - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. - - First form y := beta*y. */ - - if (beta->r != 1.f || beta->i != 0.f) { - if (*incy == 1) { - if (beta->r == 0.f && beta->i == 0.f) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = i; - Y(i).r = 0.f, Y(i).i = 0.f; -/* L10: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = i; - i__3 = i; - q__1.r = beta->r * Y(i).r - beta->i * Y(i).i, - q__1.i = beta->r * Y(i).i + beta->i * Y(i) - .r; - Y(i).r = q__1.r, Y(i).i = q__1.i; -/* L20: */ - } - } - } else { - iy = ky; - if (beta->r == 0.f && beta->i == 0.f) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = iy; - Y(iy).r = 0.f, Y(iy).i = 0.f; - iy += *incy; -/* L30: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = iy; - i__3 = iy; - q__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, - q__1.i = beta->r * Y(iy).i + beta->i * Y(iy) - .r; - Y(iy).r = q__1.r, Y(iy).i = q__1.i; - iy += *incy; -/* L40: */ - } - } - } - } - if (alpha->r == 0.f && alpha->i == 0.f) { - return 0; - } - if (lsame_(trans, "N")) { - -/* Form y := alpha*A*x + y. */ - - jx = kx; - if (*incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0.f || X(jx).i != 0.f) { - i__2 = jx; - q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - q__1.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - temp.r = q__1.r, temp.i = q__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j) - .r; - q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + - q__2.i; - Y(i).r = q__1.r, Y(i).i = q__1.i; -/* L50: */ - } - } - jx += *incx; -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0.f || X(jx).i != 0.f) { - i__2 = jx; - q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - q__1.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - temp.r = q__1.r, temp.i = q__1.i; - iy = ky; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j) - .r; - q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + - q__2.i; - Y(iy).r = q__1.r, Y(iy).i = q__1.i; - iy += *incy; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } else { - -/* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. - */ - - jy = ky; - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp.r = 0.f, temp.i = 0.f; - if (noconj) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i; - q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(i) - .i, q__2.i = A(i,j).r * X(i).i + A(i,j) - .i * X(i).r; - q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L90: */ - } - } else { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - r_cnjg(&q__3, &A(i,j)); - i__3 = i; - q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, - q__2.i = q__3.r * X(i).i + q__3.i * X(i) - .r; - q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L100: */ - } - } - i__2 = jy; - i__3 = jy; - q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = - alpha->r * temp.i + alpha->i * temp.r; - q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i; - Y(jy).r = q__1.r, Y(jy).i = q__1.i; - jy += *incy; -/* L110: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp.r = 0.f, temp.i = 0.f; - ix = kx; - if (noconj) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = ix; - q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix) - .i, q__2.i = A(i,j).r * X(ix).i + A(i,j) - .i * X(ix).r; - q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix += *incx; -/* L120: */ - } - } else { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - r_cnjg(&q__3, &A(i,j)); - i__3 = ix; - q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, - q__2.i = q__3.r * X(ix).i + q__3.i * X(ix) - .r; - q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix += *incx; -/* L130: */ - } - } - i__2 = jy; - i__3 = jy; - q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = - alpha->r * temp.i + alpha->i * temp.r; - q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i; - Y(jy).r = q__1.r, Y(jy).i = q__1.i; - jy += *incy; -/* L140: */ - } - } - } - - return 0; - -/* End of CGEMV . */ - -} /* cgemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgerc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgerc.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgerc.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgerc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,205 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int cgerc_(integer *m, integer *n, complex *alpha, complex * - x, integer *incx, complex *y, integer *incy, complex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2; - - /* Builtin functions */ - void r_cnjg(complex *, complex *); - - /* Local variables */ - static integer info; - static complex temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - CGERC performs the rank 1 operation - - A := alpha*x*conjg( y' ) + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("CGERC ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0.f || Y(jy).i != 0.f) { - r_cnjg(&q__2, &Y(jy)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp.r = q__1.r, temp.i = q__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - q__2.r = X(i).r * temp.r - X(i).i * temp.i, q__2.i = - X(i).r * temp.i + X(i).i * temp.r; - q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0.f || Y(jy).i != 0.f) { - r_cnjg(&q__2, &Y(jy)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp.r = q__1.r, temp.i = q__1.i; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - q__2.r = X(ix).r * temp.r - X(ix).i * temp.i, q__2.i = - X(ix).r * temp.i + X(ix).i * temp.r; - q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of CGERC . */ - -} /* cgerc_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgeru.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgeru.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgeru.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgeru.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,202 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int cgeru_(integer *m, integer *n, complex *alpha, complex * - x, integer *incx, complex *y, integer *incy, complex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2; - - /* Local variables */ - static integer info; - static complex temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - CGERU performs the rank 1 operation - - A := alpha*x*y' + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("CGERU ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0.f || Y(jy).i != 0.f) { - i__2 = jy; - q__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, q__1.i = - alpha->r * Y(jy).i + alpha->i * Y(jy).r; - temp.r = q__1.r, temp.i = q__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - q__2.r = X(i).r * temp.r - X(i).i * temp.i, q__2.i = - X(i).r * temp.i + X(i).i * temp.r; - q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0.f || Y(jy).i != 0.f) { - i__2 = jy; - q__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, q__1.i = - alpha->r * Y(jy).i + alpha->i * Y(jy).r; - temp.r = q__1.r, temp.i = q__1.i; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - q__2.r = X(ix).r * temp.r - X(ix).i * temp.i, q__2.i = - X(ix).r * temp.i + X(ix).i * temp.r; - q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of CGERU . */ - -} /* cgeru_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/chemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/chemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/chemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/chemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,421 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int chemv_(char *uplo, integer *n, complex *alpha, complex * - a, integer *lda, complex *x, integer *incx, complex *beta, complex *y, - integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublereal d__1; - complex q__1, q__2, q__3, q__4; - - /* Builtin functions */ - void r_cnjg(complex *, complex *); - - /* Local variables */ - static integer info; - static complex temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - CHEMV performs the matrix-vector operation - - y := alpha*A*x + beta*y, - - where alpha and beta are scalars, x and y are n element vectors and - A is an n by n hermitian matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the hermitian matrix and the strictly - lower triangular part of A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the hermitian matrix and the strictly - upper triangular part of A is not referenced. - Note that the imaginary parts of the diagonal elements need - - not be set and are assumed to be zero. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - COMPLEX . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. On exit, Y is overwritten by the updated - vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*lda < max(1,*n)) { - info = 5; - } else if (*incx == 0) { - info = 7; - } else if (*incy == 0) { - info = 10; - } - if (info != 0) { - xerbla_("CHEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && - beta->i == 0.f)) { - return 0; - } - -/* Set up the start points in X and Y. */ - - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. - - First form y := beta*y. */ - - if (beta->r != 1.f || beta->i != 0.f) { - if (*incy == 1) { - if (beta->r == 0.f && beta->i == 0.f) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - Y(i).r = 0.f, Y(i).i = 0.f; -/* L10: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - q__1.r = beta->r * Y(i).r - beta->i * Y(i).i, - q__1.i = beta->r * Y(i).i + beta->i * Y(i) - .r; - Y(i).r = q__1.r, Y(i).i = q__1.i; -/* L20: */ - } - } - } else { - iy = ky; - if (beta->r == 0.f && beta->i == 0.f) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - Y(iy).r = 0.f, Y(iy).i = 0.f; - iy += *incy; -/* L30: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = iy; - q__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, - q__1.i = beta->r * Y(iy).i + beta->i * Y(iy) - .r; - Y(iy).r = q__1.r, Y(iy).i = q__1.i; - iy += *incy; -/* L40: */ - } - } - } - } - if (alpha->r == 0.f && alpha->i == 0.f) { - return 0; - } - if (lsame_(uplo, "U")) { - -/* Form y when A is stored in upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - q__1.r = alpha->r * X(j).r - alpha->i * X(j).i, q__1.i = - alpha->r * X(j).i + alpha->i * X(j).r; - temp1.r = q__1.r, temp1.i = q__1.i; - temp2.r = 0.f, temp2.i = 0.f; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + q__2.i; - Y(i).r = q__1.r, Y(i).i = q__1.i; - r_cnjg(&q__3, &A(i,j)); - i__3 = i; - q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, q__2.i = - q__3.r * X(i).i + q__3.i * X(i).r; - q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; - temp2.r = q__1.r, temp2.i = q__1.i; -/* L50: */ - } - i__2 = j; - i__3 = j; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - q__3.r = d__1 * temp1.r, q__3.i = d__1 * temp1.i; - q__2.r = Y(j).r + q__3.r, q__2.i = Y(j).i + q__3.i; - q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = - alpha->r * temp2.i + alpha->i * temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - Y(j).r = q__1.r, Y(j).i = q__1.i; -/* L60: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, q__1.i = - alpha->r * X(jx).i + alpha->i * X(jx).r; - temp1.r = q__1.r, temp1.i = q__1.i; - temp2.r = 0.f, temp2.i = 0.f; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + q__2.i; - Y(iy).r = q__1.r, Y(iy).i = q__1.i; - r_cnjg(&q__3, &A(i,j)); - i__3 = ix; - q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, q__2.i = - q__3.r * X(ix).i + q__3.i * X(ix).r; - q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; - temp2.r = q__1.r, temp2.i = q__1.i; - ix += *incx; - iy += *incy; -/* L70: */ - } - i__2 = jy; - i__3 = jy; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - q__3.r = d__1 * temp1.r, q__3.i = d__1 * temp1.i; - q__2.r = Y(jy).r + q__3.r, q__2.i = Y(jy).i + q__3.i; - q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = - alpha->r * temp2.i + alpha->i * temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - Y(jy).r = q__1.r, Y(jy).i = q__1.i; - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } else { - -/* Form y when A is stored in lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - q__1.r = alpha->r * X(j).r - alpha->i * X(j).i, q__1.i = - alpha->r * X(j).i + alpha->i * X(j).r; - temp1.r = q__1.r, temp1.i = q__1.i; - temp2.r = 0.f, temp2.i = 0.f; - i__2 = j; - i__3 = j; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - q__2.r = d__1 * temp1.r, q__2.i = d__1 * temp1.i; - q__1.r = Y(j).r + q__2.r, q__1.i = Y(j).i + q__2.i; - Y(j).r = q__1.r, Y(j).i = q__1.i; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + q__2.i; - Y(i).r = q__1.r, Y(i).i = q__1.i; - r_cnjg(&q__3, &A(i,j)); - i__3 = i; - q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, q__2.i = - q__3.r * X(i).i + q__3.i * X(i).r; - q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; - temp2.r = q__1.r, temp2.i = q__1.i; -/* L90: */ - } - i__2 = j; - i__3 = j; - q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = - alpha->r * temp2.i + alpha->i * temp2.r; - q__1.r = Y(j).r + q__2.r, q__1.i = Y(j).i + q__2.i; - Y(j).r = q__1.r, Y(j).i = q__1.i; -/* L100: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, q__1.i = - alpha->r * X(jx).i + alpha->i * X(jx).r; - temp1.r = q__1.r, temp1.i = q__1.i; - temp2.r = 0.f, temp2.i = 0.f; - i__2 = jy; - i__3 = jy; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - q__2.r = d__1 * temp1.r, q__2.i = d__1 * temp1.i; - q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i; - Y(jy).r = q__1.r, Y(jy).i = q__1.i; - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + q__2.i; - Y(iy).r = q__1.r, Y(iy).i = q__1.i; - r_cnjg(&q__3, &A(i,j)); - i__3 = ix; - q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, q__2.i = - q__3.r * X(ix).i + q__3.i * X(ix).r; - q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; - temp2.r = q__1.r, temp2.i = q__1.i; -/* L110: */ - } - i__2 = jy; - i__3 = jy; - q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = - alpha->r * temp2.i + alpha->i * temp2.r; - q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i; - Y(jy).r = q__1.r, Y(jy).i = q__1.i; - jx += *incx; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of CHEMV . */ - -} /* chemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cher2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cher2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cher2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cher2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,436 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int cher2_(char *uplo, integer *n, complex *alpha, complex * - x, integer *incx, complex *y, integer *incy, complex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - doublereal d__1; - complex q__1, q__2, q__3, q__4; - - /* Builtin functions */ - void r_cnjg(complex *, complex *); - - /* Local variables */ - static integer info; - static complex temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - CHER2 performs the hermitian rank 2 operation - - A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, - - where alpha is a scalar, x and y are n element vectors and A is an n - - by n hermitian matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the hermitian matrix and the strictly - lower triangular part of A is not referenced. On exit, the - upper triangular part of the array A is overwritten by the - upper triangular part of the updated matrix. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the hermitian matrix and the strictly - upper triangular part of A is not referenced. On exit, the - lower triangular part of the array A is overwritten by the - lower triangular part of the updated matrix. - Note that the imaginary parts of the diagonal elements need - - not be set, they are assumed to be zero, and on exit they - are set to zero. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*n)) { - info = 9; - } - if (info != 0) { - xerbla_("CHER2 ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) { - return 0; - } - -/* Set up the start points in X and Y if the increments are not both - - unity. */ - - if (*incx != 1 || *incy != 1) { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - jx = kx; - jy = ky; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. */ - - if (lsame_(uplo, "U")) { - -/* Form A when A is stored in the upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - i__3 = j; - if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f - || Y(j).i != 0.f)) { - r_cnjg(&q__2, &Y(j)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp1.r = q__1.r, temp1.i = q__1.i; - i__2 = j; - q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, - q__2.i = alpha->r * X(j).i + alpha->i * X(j) - .r; - r_cnjg(&q__1, &q__2); - temp2.r = q__1.r, temp2.i = q__1.i; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, - q__3.i = X(i).r * temp1.i + X(i).i * - temp1.r; - q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + - q__3.i; - i__6 = i; - q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, - q__4.i = Y(i).r * temp2.i + Y(i).i * - temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; -/* L10: */ - } - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = j; - q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, - q__2.i = X(j).r * temp1.i + X(j).i * - temp1.r; - i__5 = j; - q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, - q__3.i = Y(j).r * temp2.i + Y(j).i * - temp2.r; - q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; - d__1 = A(j,j).r + q__1.r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - i__3 = jy; - if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f - || Y(jy).i != 0.f)) { - r_cnjg(&q__2, &Y(jy)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp1.r = q__1.r, temp1.i = q__1.i; - i__2 = jx; - q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - q__2.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - r_cnjg(&q__1, &q__2); - temp2.r = q__1.r, temp2.i = q__1.i; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, - q__3.i = X(ix).r * temp1.i + X(ix).i * - temp1.r; - q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + - q__3.i; - i__6 = iy; - q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, - q__4.i = Y(iy).r * temp2.i + Y(iy).i * - temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; - ix += *incx; - iy += *incy; -/* L30: */ - } - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = jx; - q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, - q__2.i = X(jx).r * temp1.i + X(jx).i * - temp1.r; - i__5 = jy; - q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, - q__3.i = Y(jy).r * temp2.i + Y(jy).i * - temp2.r; - q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; - d__1 = A(j,j).r + q__1.r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } - jx += *incx; - jy += *incy; -/* L40: */ - } - } - } else { - -/* Form A when A is stored in the lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - i__3 = j; - if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f - || Y(j).i != 0.f)) { - r_cnjg(&q__2, &Y(j)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp1.r = q__1.r, temp1.i = q__1.i; - i__2 = j; - q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, - q__2.i = alpha->r * X(j).i + alpha->i * X(j) - .r; - r_cnjg(&q__1, &q__2); - temp2.r = q__1.r, temp2.i = q__1.i; - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = j; - q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, - q__2.i = X(j).r * temp1.i + X(j).i * - temp1.r; - i__5 = j; - q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, - q__3.i = Y(j).r * temp2.i + Y(j).i * - temp2.r; - q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; - d__1 = A(j,j).r + q__1.r; - A(j,j).r = d__1, A(j,j).i = 0.f; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, - q__3.i = X(i).r * temp1.i + X(i).i * - temp1.r; - q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + - q__3.i; - i__6 = i; - q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, - q__4.i = Y(i).r * temp2.i + Y(i).i * - temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; -/* L50: */ - } - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - i__3 = jy; - if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f - || Y(jy).i != 0.f)) { - r_cnjg(&q__2, &Y(jy)); - q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = - alpha->r * q__2.i + alpha->i * q__2.r; - temp1.r = q__1.r, temp1.i = q__1.i; - i__2 = jx; - q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - q__2.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - r_cnjg(&q__1, &q__2); - temp2.r = q__1.r, temp2.i = q__1.i; - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = jx; - q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, - q__2.i = X(jx).r * temp1.i + X(jx).i * - temp1.r; - i__5 = jy; - q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, - q__3.i = Y(jy).r * temp2.i + Y(jy).i * - temp2.r; - q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; - d__1 = A(j,j).r + q__1.r; - A(j,j).r = d__1, A(j,j).i = 0.f; - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, - q__3.i = X(ix).r * temp1.i + X(ix).i * - temp1.r; - q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + - q__3.i; - i__6 = iy; - q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, - q__4.i = Y(iy).r * temp2.i + Y(iy).i * - temp2.r; - q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; - A(i,j).r = q__1.r, A(i,j).i = q__1.i; -/* L70: */ - } - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.f; - } - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } - - return 0; - -/* End of CHER2 . */ - -} /* cher2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Cnames.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Cnames.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Cnames.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Cnames.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,346 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. - */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ DASUM -#define idamax_ IDAMAX -#define dcopy_ DCOPY -#define dscal_ DSCAL -#define dger_ DGER -#define dnrm2_ DNRM2 -#define dsymv_ DSYMV -#define ddot_ DDOT -#define daxpy_ DAXPY -#define dsyr2_ DSYR2 -#define drot_ DROT -#define dgemv_ DGEMV -#define dtrsv_ DTRSV -#define dgemm_ DGEMM -#define dtrsm_ DTRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ DZASUM -#define izamax_ IZAMAX -#define zcopy_ ZCOPY -#define zscal_ ZSCAL -#define dznrm2_ DZNRM2 -#define zaxpy_ ZAXPY -#define zgemv_ ZGEMV -#define ztrsv_ ZTRSV -#define zgemm_ ZGEMM -#define ztrsm_ ZTRSM -#define zgerc_ ZGERC -#define zhemv_ ZHEMV -#define zher2_ ZHER2 -#define zgeru_ ZGERU - -/* LAPACK */ -#define dlamch_ DLAMCH -#define slamch_ SLAMCH -#define xerbla_ XERBLA -#define lsame_ LSAME - -#define mc64id_ MC64ID -#define mc64ad_ MC64AD -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define f_ScalePermstructFree F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) - */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define dlamch_ dlamch -#define slamch_ slamch -#define xerbla_ xerbla -#define lsame_ lsame - -#define mc64id_ mc64id -#define mc64ad_ mc64ad - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#endif /* __SUPERLU_CNAMES */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cscal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cscal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cscal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int cscal_(integer *n, complex *ca, complex *cx, integer * - incx) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3, i__4; - complex q__1; - - /* Local variables */ - static integer i, nincx; - - -/* scales a vector by a constant. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define CX(I) cx[(I)-1] - - - if (*n <= 0 || *incx <= 0) { - return 0; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - i__3 = i; - i__4 = i; - q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX( - i).i + ca->i * CX(i).r; - CX(i).r = q__1.r, CX(i).i = q__1.i; -/* L10: */ - } - return 0; - -/* code for increment equal to 1 */ - -L20: - i__2 = *n; - for (i = 1; i <= *n; ++i) { - i__1 = i; - i__3 = i; - q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX( - i).i + ca->i * CX(i).r; - CX(i).r = q__1.r, CX(i).i = q__1.i; -/* L30: */ - } - return 0; -} /* cscal_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,509 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int ctrsv_(char *uplo, char *trans, char *diag, integer *n, - complex *a, integer *lda, complex *x, integer *incx) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2, q__3; - - /* Builtin functions */ - void c_div(complex *, complex *, complex *), r_cnjg(complex *, complex *); - - /* Local variables */ - static integer info; - static complex temp; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, jx, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical noconj, nounit; - - -/* Purpose - ======= - - CTRSV solves one of the systems of equations - - A*x = b, or A'*x = b, or conjg( A' )*x = b, - - where b and x are n element vectors and A is an n by n unit, or - non-unit, upper or lower triangular matrix. - - No test for singularity or near-singularity is included in this - routine. Such tests must be performed before calling this routine. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix is an upper or - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANS - CHARACTER*1. - On entry, TRANS specifies the equations to be solved as - follows: - - TRANS = 'N' or 'n' A*x = b. - - TRANS = 'T' or 't' A'*x = b. - - TRANS = 'C' or 'c' conjg( A' )*x = b. - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit - triangular as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - A - COMPLEX array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - COMPLEX array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element right-hand side vector b. On exit, X is overwritten - - with the solution vector x. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && - ! lsame_(trans, "C")) { - info = 2; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*lda < max(1,*n)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } - if (info != 0) { - xerbla_("CTRSV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - - noconj = lsame_(trans, "T"); - nounit = lsame_(diag, "N"); - -/* Set up the start point in X if the increment is not unity. This - will be ( N - 1 )*INCX too small for descending loops. */ - - if (*incx <= 0) { - kx = 1 - (*n - 1) * *incx; - } else if (*incx != 1) { - kx = 1; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (lsame_(trans, "N")) { - -/* Form x := inv( A )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - i__1 = j; - if (X(j).r != 0.f || X(j).i != 0.f) { - if (nounit) { - i__1 = j; - c_div(&q__1, &X(j), &A(j,j)); - X(j).r = q__1.r, X(j).i = q__1.i; - } - i__1 = j; - temp.r = X(j).r, temp.i = X(j).i; - for (i = j - 1; i >= 1; --i) { - i__1 = i; - i__2 = i; - i__3 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - - q__2.i; - X(i).r = q__1.r, X(i).i = q__1.i; -/* L10: */ - } - } -/* L20: */ - } - } else { - jx = kx + (*n - 1) * *incx; - for (j = *n; j >= 1; --j) { - i__1 = jx; - if (X(jx).r != 0.f || X(jx).i != 0.f) { - if (nounit) { - i__1 = jx; - c_div(&q__1, &X(jx), &A(j,j)); - X(jx).r = q__1.r, X(jx).i = q__1.i; - } - i__1 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - ix = jx; - for (i = j - 1; i >= 1; --i) { - ix -= *incx; - i__1 = ix; - i__2 = ix; - i__3 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - - q__2.i; - X(ix).r = q__1.r, X(ix).i = q__1.i; -/* L30: */ - } - } - jx -= *incx; -/* L40: */ - } - } - } else { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - if (X(j).r != 0.f || X(j).i != 0.f) { - if (nounit) { - i__2 = j; - c_div(&q__1, &X(j), &A(j,j)); - X(j).r = q__1.r, X(j).i = q__1.i; - } - i__2 = j; - temp.r = X(j).r, temp.i = X(j).i; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - - q__2.i; - X(i).r = q__1.r, X(i).i = q__1.i; -/* L50: */ - } - } -/* L60: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0.f || X(jx).i != 0.f) { - if (nounit) { - i__2 = jx; - c_div(&q__1, &X(jx), &A(j,j)); - X(jx).r = q__1.r, X(jx).i = q__1.i; - } - i__2 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - ix = jx; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - i__3 = ix; - i__4 = ix; - i__5 = i + j * a_dim1; - q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - - q__2.i; - X(ix).r = q__1.r, X(ix).i = q__1.i; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } - } else { - -/* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - temp.r = X(j).r, temp.i = X(j).i; - if (noconj) { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i; - q__2.r = A(i,j).r * X(i).r - A(i,j).i * X( - i).i, q__2.i = A(i,j).r * X(i).i + - A(i,j).i * X(i).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L90: */ - } - if (nounit) { - c_div(&q__1, &temp, &A(j,j)); - temp.r = q__1.r, temp.i = q__1.i; - } - } else { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - r_cnjg(&q__3, &A(i,j)); - i__3 = i; - q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, - q__2.i = q__3.r * X(i).i + q__3.i * X( - i).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L100: */ - } - if (nounit) { - r_cnjg(&q__2, &A(j,j)); - c_div(&q__1, &temp, &q__2); - temp.r = q__1.r, temp.i = q__1.i; - } - } - i__2 = j; - X(j).r = temp.r, X(j).i = temp.i; -/* L110: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - ix = kx; - i__2 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - if (noconj) { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = ix; - q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X( - ix).i, q__2.i = A(i,j).r * X(ix).i + - A(i,j).i * X(ix).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix += *incx; -/* L120: */ - } - if (nounit) { - c_div(&q__1, &temp, &A(j,j)); - temp.r = q__1.r, temp.i = q__1.i; - } - } else { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - r_cnjg(&q__3, &A(i,j)); - i__3 = ix; - q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, - q__2.i = q__3.r * X(ix).i + q__3.i * X( - ix).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix += *incx; -/* L130: */ - } - if (nounit) { - r_cnjg(&q__2, &A(j,j)); - c_div(&q__1, &temp, &q__2); - temp.r = q__1.r, temp.i = q__1.i; - } - } - i__2 = jx; - X(jx).r = temp.r, X(jx).i = temp.i; - jx += *incx; -/* L140: */ - } - } - } else { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - i__1 = j; - temp.r = X(j).r, temp.i = X(j).i; - if (noconj) { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - i__2 = i + j * a_dim1; - i__3 = i; - q__2.r = A(i,j).r * X(i).r - A(i,j).i * X( - i).i, q__2.i = A(i,j).r * X(i).i + - A(i,j).i * X(i).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L150: */ - } - if (nounit) { - c_div(&q__1, &temp, &A(j,j)); - temp.r = q__1.r, temp.i = q__1.i; - } - } else { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - r_cnjg(&q__3, &A(i,j)); - i__2 = i; - q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, - q__2.i = q__3.r * X(i).i + q__3.i * X( - i).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; -/* L160: */ - } - if (nounit) { - r_cnjg(&q__2, &A(j,j)); - c_div(&q__1, &temp, &q__2); - temp.r = q__1.r, temp.i = q__1.i; - } - } - i__1 = j; - X(j).r = temp.r, X(j).i = temp.i; -/* L170: */ - } - } else { - kx += (*n - 1) * *incx; - jx = kx; - for (j = *n; j >= 1; --j) { - ix = kx; - i__1 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - if (noconj) { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - i__2 = i + j * a_dim1; - i__3 = ix; - q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X( - ix).i, q__2.i = A(i,j).r * X(ix).i + - A(i,j).i * X(ix).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix -= *incx; -/* L180: */ - } - if (nounit) { - c_div(&q__1, &temp, &A(j,j)); - temp.r = q__1.r, temp.i = q__1.i; - } - } else { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - r_cnjg(&q__3, &A(i,j)); - i__2 = ix; - q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, - q__2.i = q__3.r * X(ix).i + q__3.i * X( - ix).r; - q__1.r = temp.r - q__2.r, q__1.i = temp.i - - q__2.i; - temp.r = q__1.r, temp.i = q__1.i; - ix -= *incx; -/* L190: */ - } - if (nounit) { - r_cnjg(&q__2, &A(j,j)); - c_div(&q__1, &temp, &q__2); - temp.r = q__1.r, temp.i = q__1.i; - } - } - i__1 = jx; - X(jx).r = temp.r, X(jx).i = temp.i; - jx -= *incx; -/* L200: */ - } - } - } - } - - return 0; - -/* End of CTRSV . */ - -} /* ctrsv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dasum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dasum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dasum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,88 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal dasum_(integer *n, doublereal *dx, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - doublereal ret_val, d__1, d__2, d__3, d__4, d__5, d__6; - - /* Local variables */ - static integer i, m; - static doublereal dtemp; - static integer nincx, mp1; - - -/* takes the sum of the absolute values. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DX(I) dx[(I)-1] - - - ret_val = 0.; - dtemp = 0.; - if (*n <= 0 || *incx <= 0) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - dtemp += (d__1 = DX(i), abs(d__1)); -/* L10: */ - } - ret_val = dtemp; - return ret_val; - -/* code for increment equal to 1 - - - clean-up loop */ - -L20: - m = *n % 6; - if (m == 0) { - goto L40; - } - i__2 = m; - for (i = 1; i <= m; ++i) { - dtemp += (d__1 = DX(i), abs(d__1)); -/* L30: */ - } - if (*n < 6) { - goto L60; - } -L40: - mp1 = m + 1; - i__2 = *n; - for (i = mp1; i <= *n; i += 6) { - dtemp = dtemp + (d__1 = DX(i), abs(d__1)) + (d__2 = DX(i + 1), abs( - d__2)) + (d__3 = DX(i + 2), abs(d__3)) + (d__4 = DX(i + 3), - abs(d__4)) + (d__5 = DX(i + 4), abs(d__5)) + (d__6 = DX(i + 5) - , abs(d__6)); -/* L50: */ - } -L60: - ret_val = dtemp; - return ret_val; -} /* dasum_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/daxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/daxpy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/daxpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/daxpy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int daxpy_(integer *n, doublereal *da, doublereal *dx, - integer *incx, doublereal *dy, integer *incy) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i, m, ix, iy, mp1; - - -/* constant times a vector plus a vector. - uses unrolled loops for increments equal to one. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DY(I) dy[(I)-1] -#define DX(I) dx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*da == 0.) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - DY(iy) += *da * DX(ix); - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 4; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - DY(i) += *da * DX(i); -/* L30: */ - } - if (*n < 4) { - return 0; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 4) { - DY(i) += *da * DX(i); - DY(i + 1) += *da * DX(i + 1); - DY(i + 2) += *da * DX(i + 2); - DY(i + 3) += *da * DX(i + 3); -/* L50: */ - } - return 0; -} /* daxpy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal dcabs1_(doublecomplex *z) -{ -/* >>Start of File<< - - System generated locals */ - doublereal ret_val; - static doublecomplex equiv_0[1]; - - /* Local variables */ -#define t ((doublereal *)equiv_0) -#define zz (equiv_0) - - zz->r = z->r, zz->i = z->i; - ret_val = abs(t[0]) + abs(t[1]); - return ret_val; -} /* dcabs1_ */ - -#undef zz -#undef t - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcopy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcopy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcopy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dcopy_(integer *n, doublereal *dx, integer *incx, - doublereal *dy, integer *incy) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i, m, ix, iy, mp1; - - -/* copies a vector, x, to a vector, y. - uses unrolled loops for increments equal to one. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DY(I) dy[(I)-1] -#define DX(I) dx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - DY(iy) = DX(ix); - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 7; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - DY(i) = DX(i); -/* L30: */ - } - if (*n < 7) { - return 0; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 7) { - DY(i) = DX(i); - DY(i + 1) = DX(i + 1); - DY(i + 2) = DX(i + 2); - DY(i + 3) = DX(i + 3); - DY(i + 4) = DX(i + 4); - DY(i + 5) = DX(i + 5); - DY(i + 6) = DX(i + 6); -/* L50: */ - } - return 0; -} /* dcopy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ddot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ddot.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ddot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ddot.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal ddot_(integer *n, doublereal *dx, integer *incx, doublereal *dy, - integer *incy) -{ - - - /* System generated locals */ - integer i__1; - doublereal ret_val; - - /* Local variables */ - static integer i, m; - static doublereal dtemp; - static integer ix, iy, mp1; - - -/* forms the dot product of two vectors. - uses unrolled loops for increments equal to one. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DY(I) dy[(I)-1] -#define DX(I) dx[(I)-1] - - - ret_val = 0.; - dtemp = 0.; - if (*n <= 0) { - return ret_val; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - dtemp += DX(ix) * DY(iy); - ix += *incx; - iy += *incy; -/* L10: */ - } - ret_val = dtemp; - return ret_val; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 5; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - dtemp += DX(i) * DY(i); -/* L30: */ - } - if (*n < 5) { - goto L60; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 5) { - dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) * - DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4); -/* L50: */ - } -L60: - ret_val = dtemp; - return ret_val; -} /* ddot_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,395 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dgemm_(char *transa, char *transb, integer *m, integer * - n, integer *k, doublereal *alpha, doublereal *a, integer *lda, - doublereal *b, integer *ldb, doublereal *beta, doublereal *c, integer - *ldc) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, - i__3; - - /* Local variables */ - static integer info; - static logical nota, notb; - static doublereal temp; - static integer i, j, l, ncola; - extern logical lsame_(char *, char *); - static integer nrowa, nrowb; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - DGEMM performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X', - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - CHARACTER*1. - On entry, TRANSA specifies the form of op( A ) to be used in - - the matrix multiplication as follows: - - TRANSA = 'N' or 'n', op( A ) = A. - - TRANSA = 'T' or 't', op( A ) = A'. - - TRANSA = 'C' or 'c', op( A ) = A'. - - Unchanged on exit. - - TRANSB - CHARACTER*1. - On entry, TRANSB specifies the form of op( B ) to be used in - - the matrix multiplication as follows: - - TRANSB = 'N' or 'n', op( B ) = B. - - TRANSB = 'T' or 't', op( B ) = B'. - - TRANSB = 'C' or 'c', op( B ) = B'. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix - - op( A ) and of the matrix C. M must be at least zero. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix - - op( B ) and the number of columns of the matrix C. N must be - - at least zero. - Unchanged on exit. - - K - INTEGER. - On entry, K specifies the number of columns of the matrix - - op( A ) and the number of rows of the matrix op( B ). K must - - be at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is - - k when TRANSA = 'N' or 'n', and is m otherwise. - Before entry with TRANSA = 'N' or 'n', the leading m by k - - part of the array A must contain the matrix A, otherwise - - the leading k by m part of the array A must contain the - - matrix A. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. When TRANSA = 'N' or 'n' then - - LDA must be at least max( 1, m ), otherwise LDA must be at - - least max( 1, k ). - Unchanged on exit. - - B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is - - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - - part of the array B must contain the matrix B, otherwise - - the leading n by k part of the array B must contain the - - matrix B. - Unchanged on exit. - - LDB - INTEGER. - On entry, LDB specifies the first dimension of B as declared - - in the calling (sub) program. When TRANSB = 'N' or 'n' then - - LDB must be at least max( 1, k ), otherwise LDB must be at - - least max( 1, n ). - Unchanged on exit. - - BETA - DOUBLE PRECISION. - On entry, BETA specifies the scalar beta. When BETA is - - supplied as zero then C need not be set on input. - Unchanged on exit. - - C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - - contain the matrix C, except when beta is zero, in which - - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - - ( alpha*op( A )*op( B ) + beta*C ). - - LDC - INTEGER. - On entry, LDC specifies the first dimension of C as declared - - in the calling (sub) program. LDC must be at least - - max( 1, m ). - Unchanged on exit. - - - Level 3 Blas routine. - - -- Written on 8-February-1989. - Jack Dongarra, Argonne National Laboratory. - Iain Duff, AERE Harwell. - Jeremy Du Croz, Numerical Algorithms Group Ltd. - Sven Hammarling, Numerical Algorithms Group Ltd. - - - - Set NOTA and NOTB as true if A and B respectively are not - - transposed and set NROWA, NCOLA and NROWB as the number of rows - - and columns of A and the number of rows of B respectively. - - - - Parameter adjustments - Function Body */ - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] -#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] -#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)] - - nota = lsame_(transa, "N"); - notb = lsame_(transb, "N"); - if (nota) { - nrowa = *m; - ncola = *k; - } else { - nrowa = *k; - ncola = *m; - } - if (notb) { - nrowb = *k; - } else { - nrowb = *n; - } - -/* Test the input parameters. */ - - info = 0; - if (! nota && ! lsame_(transa, "C") && ! lsame_(transa, "T")) { - info = 1; - } else if (! notb && ! lsame_(transb, "C") && ! lsame_(transb, - "T")) { - info = 2; - } else if (*m < 0) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*k < 0) { - info = 5; - } else if (*lda < max(1,nrowa)) { - info = 8; - } else if (*ldb < max(1,nrowb)) { - info = 10; - } else if (*ldc < max(1,*m)) { - info = 13; - } - if (info != 0) { - xerbla_("DGEMM ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) { - return 0; - } - -/* And if alpha.eq.zero. */ - - if (*alpha == 0.) { - if (*beta == 0.) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = 0.; -/* L10: */ - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = *beta * C(i,j); -/* L30: */ - } -/* L40: */ - } - } - return 0; - } - -/* Start the operations. */ - - if (notb) { - if (nota) { - -/* Form C := alpha*A*B + beta*C. */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (*beta == 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = 0.; -/* L50: */ - } - } else if (*beta != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = *beta * C(i,j); -/* L60: */ - } - } - i__2 = *k; - for (l = 1; l <= *k; ++l) { - if (B(l,j) != 0.) { - temp = *alpha * B(l,j); - i__3 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) += temp * A(i,l); -/* L70: */ - } - } -/* L80: */ - } -/* L90: */ - } - } else { - -/* Form C := alpha*A'*B + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - temp += A(l,i) * B(l,j); -/* L100: */ - } - if (*beta == 0.) { - C(i,j) = *alpha * temp; - } else { - C(i,j) = *alpha * temp + *beta * C(i,j); - } -/* L110: */ - } -/* L120: */ - } - } - } else { - if (nota) { - -/* Form C := alpha*A*B' + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (*beta == 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = 0.; -/* L130: */ - } - } else if (*beta != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) = *beta * C(i,j); -/* L140: */ - } - } - i__2 = *k; - for (l = 1; l <= *k; ++l) { - if (B(j,l) != 0.) { - temp = *alpha * B(j,l); - i__3 = *m; - for (i = 1; i <= *m; ++i) { - C(i,j) += temp * A(i,l); -/* L150: */ - } - } -/* L160: */ - } -/* L170: */ - } - } else { - -/* Form C := alpha*A'*B' + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - temp += A(l,i) * B(j,l); -/* L180: */ - } - if (*beta == 0.) { - C(i,j) = *alpha * temp; - } else { - C(i,j) = *alpha * temp + *beta * C(i,j); - } -/* L190: */ - } -/* L200: */ - } - } - } - - return 0; - -/* End of DGEMM . */ - -} /* dgemm_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,299 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dgemv_(char *trans, integer *m, integer *n, doublereal * - alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, - doublereal *beta, doublereal *y, integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static doublereal temp; - static integer lenx, leny, i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - DGEMV performs one of the matrix-vector operations - - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - - where alpha and beta are scalars, x and y are vectors and A is an - m by n matrix. - - Parameters - ========== - - TRANS - CHARACTER*1. - On entry, TRANS specifies the operation to be performed as - follows: - - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - X - DOUBLE PRECISION array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - DOUBLE PRECISION. - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - DOUBLE PRECISION array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - - updated vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! - lsame_(trans, "C")) { - info = 1; - } else if (*m < 0) { - info = 2; - } else if (*n < 0) { - info = 3; - } else if (*lda < max(1,*m)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } else if (*incy == 0) { - info = 11; - } - if (info != 0) { - xerbla_("DGEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) { - return 0; - } - -/* Set LENX and LENY, the lengths of the vectors x and y, and set - - up the start points in X and Y. */ - - if (lsame_(trans, "N")) { - lenx = *n; - leny = *m; - } else { - lenx = *m; - leny = *n; - } - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (lenx - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (leny - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. - - First form y := beta*y. */ - - if (*beta != 1.) { - if (*incy == 1) { - if (*beta == 0.) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(i) = 0.; -/* L10: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(i) = *beta * Y(i); -/* L20: */ - } - } - } else { - iy = ky; - if (*beta == 0.) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(iy) = 0.; - iy += *incy; -/* L30: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(iy) = *beta * Y(iy); - iy += *incy; -/* L40: */ - } - } - } - } - if (*alpha == 0.) { - return 0; - } - if (lsame_(trans, "N")) { - -/* Form y := alpha*A*x + y. */ - - jx = kx; - if (*incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.) { - temp = *alpha * X(jx); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - Y(i) += temp * A(i,j); -/* L50: */ - } - } - jx += *incx; -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.) { - temp = *alpha * X(jx); - iy = ky; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - Y(iy) += temp * A(i,j); - iy += *incy; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } else { - -/* Form y := alpha*A'*x + y. */ - - jy = ky; - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = 0.; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp += A(i,j) * X(i); -/* L90: */ - } - Y(jy) += *alpha * temp; - jy += *incy; -/* L100: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = 0.; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp += A(i,j) * X(ix); - ix += *incx; -/* L110: */ - } - Y(jy) += *alpha * temp; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of DGEMV . */ - -} /* dgemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dger.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dger.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dger.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dger.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,182 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dger_(integer *m, integer *n, doublereal *alpha, - doublereal *x, integer *incx, doublereal *y, integer *incy, - doublereal *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static doublereal temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - DGER performs the rank 1 operation - - A := alpha*x*y' + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - DOUBLE PRECISION array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("DGER ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || *alpha == 0.) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (Y(jy) != 0.) { - temp = *alpha * Y(jy); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - A(i,j) += X(i) * temp; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (Y(jy) != 0.) { - temp = *alpha * Y(jy); - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - A(i,j) += X(ix) * temp; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of DGER . */ - -} /* dger_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal dnrm2_(integer *n, doublereal *x, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - doublereal ret_val, d__1; - - /* Builtin functions */ - double sqrt(doublereal); - - /* Local variables */ - static doublereal norm, scale, absxi; - static integer ix; - static doublereal ssq; - - -/* DNRM2 returns the euclidean norm of a vector via the function - name, so that - - DNRM2 := sqrt( x'*x ) - - - - -- This version written on 25-October-1982. - Modified on 14-October-1993 to inline the call to DLASSQ. - Sven Hammarling, Nag Ltd. - - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - - - if (*n < 1 || *incx < 1) { - norm = 0.; - } else if (*n == 1) { - norm = abs(X(1)); - } else { - scale = 0.; - ssq = 1.; -/* The following loop is equivalent to this call to the LAPACK - - auxiliary routine: - CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */ - - i__1 = (*n - 1) * *incx + 1; - i__2 = *incx; - for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { - if (X(ix) != 0.) { - absxi = (d__1 = X(ix), abs(d__1)); - if (scale < absxi) { -/* Computing 2nd power */ - d__1 = scale / absxi; - ssq = ssq * (d__1 * d__1) + 1.; - scale = absxi; - } else { -/* Computing 2nd power */ - d__1 = absxi / scale; - ssq += d__1 * d__1; - } - } -/* L10: */ - } - norm = scale * sqrt(ssq); - } - - ret_val = norm; - return ret_val; - -/* End of DNRM2. */ - -} /* dnrm2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/drot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/drot.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/drot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/drot.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int drot_(integer *n, doublereal *dx, integer *incx, - doublereal *dy, integer *incy, doublereal *c, doublereal *s) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i; - static doublereal dtemp; - static integer ix, iy; - - -/* applies a plane rotation. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DY(I) dy[(I)-1] -#define DX(I) dx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments not equal - to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - dtemp = *c * DX(ix) + *s * DY(iy); - DY(iy) = *c * DY(iy) - *s * DX(ix); - DX(ix) = dtemp; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - dtemp = *c * DX(i) + *s * DY(i); - DY(i) = *c * DY(i) - *s * DX(i); - DX(i) = dtemp; -/* L30: */ - } - return 0; -} /* drot_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dscal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dscal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dscal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dscal_(integer *n, doublereal *da, doublereal *dx, - integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - - /* Local variables */ - static integer i, m, nincx, mp1; - - -/* scales a vector by a constant. - uses unrolled loops for increment equal to one. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DX(I) dx[(I)-1] - - - if (*n <= 0 || *incx <= 0) { - return 0; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - DX(i) = *da * DX(i); -/* L10: */ - } - return 0; - -/* code for increment equal to 1 - - - clean-up loop */ - -L20: - m = *n % 5; - if (m == 0) { - goto L40; - } - i__2 = m; - for (i = 1; i <= m; ++i) { - DX(i) = *da * DX(i); -/* L30: */ - } - if (*n < 5) { - return 0; - } -L40: - mp1 = m + 1; - i__2 = *n; - for (i = mp1; i <= *n; i += 5) { - DX(i) = *da * DX(i); - DX(i + 1) = *da * DX(i + 1); - DX(i + 2) = *da * DX(i + 2); - DX(i + 3) = *da * DX(i + 3); - DX(i + 4) = *da * DX(i + 4); -/* L50: */ - } - return 0; -} /* dscal_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsymv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsymv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsymv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsymv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,300 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dsymv_(char *uplo, integer *n, doublereal *alpha, - doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal - *beta, doublereal *y, integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static doublereal temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - DSYMV performs the matrix-vector operation - - y := alpha*A*x + beta*y, - - where alpha and beta are scalars, x and y are n element vectors and - A is an n by n symmetric matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the symmetric matrix and the strictly - lower triangular part of A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the symmetric matrix and the strictly - upper triangular part of A is not referenced. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - DOUBLE PRECISION. - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. On exit, Y is overwritten by the updated - vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*lda < max(1,*n)) { - info = 5; - } else if (*incx == 0) { - info = 7; - } else if (*incy == 0) { - info = 10; - } - if (info != 0) { - xerbla_("DSYMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || *alpha == 0. && *beta == 1.) { - return 0; - } - -/* Set up the start points in X and Y. */ - - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. - - First form y := beta*y. */ - - if (*beta != 1.) { - if (*incy == 1) { - if (*beta == 0.) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(i) = 0.; -/* L10: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(i) = *beta * Y(i); -/* L20: */ - } - } - } else { - iy = ky; - if (*beta == 0.) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(iy) = 0.; - iy += *incy; -/* L30: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(iy) = *beta * Y(iy); - iy += *incy; -/* L40: */ - } - } - } - } - if (*alpha == 0.) { - return 0; - } - if (lsame_(uplo, "U")) { - -/* Form y when A is stored in upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(j); - temp2 = 0.; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - Y(i) += temp1 * A(i,j); - temp2 += A(i,j) * X(i); -/* L50: */ - } - Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2; -/* L60: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(jx); - temp2 = 0.; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - Y(iy) += temp1 * A(i,j); - temp2 += A(i,j) * X(ix); - ix += *incx; - iy += *incy; -/* L70: */ - } - Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2; - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } else { - -/* Form y when A is stored in lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(j); - temp2 = 0.; - Y(j) += temp1 * A(j,j); - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - Y(i) += temp1 * A(i,j); - temp2 += A(i,j) * X(i); -/* L90: */ - } - Y(j) += *alpha * temp2; -/* L100: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(jx); - temp2 = 0.; - Y(jy) += temp1 * A(j,j); - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - Y(iy) += temp1 * A(i,j); - temp2 += A(i,j) * X(ix); -/* L110: */ - } - Y(jy) += *alpha * temp2; - jx += *incx; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of DSYMV . */ - -} /* dsymv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,264 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dsyr2_(char *uplo, integer *n, doublereal *alpha, - doublereal *x, integer *incx, doublereal *y, integer *incy, - doublereal *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static doublereal temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - DSYR2 performs the symmetric rank 2 operation - - A := alpha*x*y' + alpha*y*x' + A, - - where alpha is a scalar, x and y are n element vectors and A is an n - - by n symmetric matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the symmetric matrix and the strictly - lower triangular part of A is not referenced. On exit, the - upper triangular part of the array A is overwritten by the - upper triangular part of the updated matrix. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the symmetric matrix and the strictly - upper triangular part of A is not referenced. On exit, the - lower triangular part of the array A is overwritten by the - lower triangular part of the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*n)) { - info = 9; - } - if (info != 0) { - xerbla_("DSYR2 ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || *alpha == 0.) { - return 0; - } - -/* Set up the start points in X and Y if the increments are not both - - unity. */ - - if (*incx != 1 || *incy != 1) { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - jx = kx; - jy = ky; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. */ - - if (lsame_(uplo, "U")) { - -/* Form A when A is stored in the upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0. || Y(j) != 0.) { - temp1 = *alpha * Y(j); - temp2 = *alpha * X(j); - i__2 = j; - for (i = 1; i <= j; ++i) { - A(i,j) = A(i,j) + X(i) * temp1 - + Y(i) * temp2; -/* L10: */ - } - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0. || Y(jy) != 0.) { - temp1 = *alpha * Y(jy); - temp2 = *alpha * X(jx); - ix = kx; - iy = ky; - i__2 = j; - for (i = 1; i <= j; ++i) { - A(i,j) = A(i,j) + X(ix) * temp1 - + Y(iy) * temp2; - ix += *incx; - iy += *incy; -/* L30: */ - } - } - jx += *incx; - jy += *incy; -/* L40: */ - } - } - } else { - -/* Form A when A is stored in the lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0. || Y(j) != 0.) { - temp1 = *alpha * Y(j); - temp2 = *alpha * X(j); - i__2 = *n; - for (i = j; i <= *n; ++i) { - A(i,j) = A(i,j) + X(i) * temp1 - + Y(i) * temp2; -/* L50: */ - } - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0. || Y(jy) != 0.) { - temp1 = *alpha * Y(jy); - temp2 = *alpha * X(jx); - ix = jx; - iy = jy; - i__2 = *n; - for (i = j; i <= *n; ++i) { - A(i,j) = A(i,j) + X(ix) * temp1 - + Y(iy) * temp2; - ix += *incx; - iy += *incy; -/* L70: */ - } - } - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } - - return 0; - -/* End of DSYR2 . */ - -} /* dsyr2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,485 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dtrsm_(char *side, char *uplo, char *transa, char *diag, - integer *m, integer *n, doublereal *alpha, doublereal *a, integer * - lda, doublereal *b, integer *ldb) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; - - /* Local variables */ - static integer info; - static doublereal temp; - static integer i, j, k; - static logical lside; - extern logical lsame_(char *, char *); - static integer nrowa; - static logical upper; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical nounit; - - -/* Purpose - ======= - - DTRSM solves one of the matrix equations - - op( A )*X = alpha*B, or X*op( A ) = alpha*B, - - where alpha is a scalar, X and B are m by n matrices, A is a unit, or - - non-unit, upper or lower triangular matrix and op( A ) is one of - - - op( A ) = A or op( A ) = A'. - - The matrix X is overwritten on B. - - Parameters - ========== - - SIDE - CHARACTER*1. - On entry, SIDE specifies whether op( A ) appears on the left - - or right of X as follows: - - SIDE = 'L' or 'l' op( A )*X = alpha*B. - - SIDE = 'R' or 'r' X*op( A ) = alpha*B. - - Unchanged on exit. - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix A is an upper or - - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANSA - CHARACTER*1. - On entry, TRANSA specifies the form of op( A ) to be used in - - the matrix multiplication as follows: - - TRANSA = 'N' or 'n' op( A ) = A. - - TRANSA = 'T' or 't' op( A ) = A'. - - TRANSA = 'C' or 'c' op( A ) = A'. - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit triangular - - as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of B. M must be at - - least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of B. N must be - - at least zero. - Unchanged on exit. - - ALPHA - DOUBLE PRECISION. - On entry, ALPHA specifies the scalar alpha. When alpha is - - zero then A is not referenced and B need not be set before - - entry. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m - - when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. - - Before entry with UPLO = 'U' or 'u', the leading k by k - - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading k by k - - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. When SIDE = 'L' or 'l' then - - LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' - - then LDA must be at least max( 1, n ). - Unchanged on exit. - - B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). - Before entry, the leading m by n part of the array B must - - contain the right-hand side matrix B, and on exit is - - overwritten by the solution matrix X. - - LDB - INTEGER. - On entry, LDB specifies the first dimension of B as declared - - in the calling (sub) program. LDB must be at least - - max( 1, m ). - Unchanged on exit. - - - Level 3 Blas routine. - - - -- Written on 8-February-1989. - Jack Dongarra, Argonne National Laboratory. - Iain Duff, AERE Harwell. - Jeremy Du Croz, Numerical Algorithms Group Ltd. - Sven Hammarling, Numerical Algorithms Group Ltd. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] -#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] - - lside = lsame_(side, "L"); - if (lside) { - nrowa = *m; - } else { - nrowa = *n; - } - nounit = lsame_(diag, "N"); - upper = lsame_(uplo, "U"); - - info = 0; - if (! lside && ! lsame_(side, "R")) { - info = 1; - } else if (! upper && ! lsame_(uplo, "L")) { - info = 2; - } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") - && ! lsame_(transa, "C")) { - info = 3; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 4; - } else if (*m < 0) { - info = 5; - } else if (*n < 0) { - info = 6; - } else if (*lda < max(1,nrowa)) { - info = 9; - } else if (*ldb < max(1,*m)) { - info = 11; - } - if (info != 0) { - xerbla_("DTRSM ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - -/* And when alpha.eq.zero. */ - - if (*alpha == 0.) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = 0.; -/* L10: */ - } -/* L20: */ - } - return 0; - } - -/* Start the operations. */ - - if (lside) { - if (lsame_(transa, "N")) { - -/* Form B := alpha*inv( A )*B. */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (*alpha != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = *alpha * B(i,j); -/* L30: */ - } - } - for (k = *m; k >= 1; --k) { - if (B(k,j) != 0.) { - if (nounit) { - B(k,j) /= A(k,k); - } - i__2 = k - 1; - for (i = 1; i <= k-1; ++i) { - B(i,j) -= B(k,j) * A(i,k); -/* L40: */ - } - } -/* L50: */ - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (*alpha != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = *alpha * B(i,j); -/* L70: */ - } - } - i__2 = *m; - for (k = 1; k <= *m; ++k) { - if (B(k,j) != 0.) { - if (nounit) { - B(k,j) /= A(k,k); - } - i__3 = *m; - for (i = k + 1; i <= *m; ++i) { - B(i,j) -= B(k,j) * A(i,k); -/* L80: */ - } - } -/* L90: */ - } -/* L100: */ - } - } - } else { - -/* Form B := alpha*inv( A' )*B. */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp = *alpha * B(i,j); - i__3 = i - 1; - for (k = 1; k <= i-1; ++k) { - temp -= A(k,i) * B(k,j); -/* L110: */ - } - if (nounit) { - temp /= A(i,i); - } - B(i,j) = temp; -/* L120: */ - } -/* L130: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - for (i = *m; i >= 1; --i) { - temp = *alpha * B(i,j); - i__2 = *m; - for (k = i + 1; k <= *m; ++k) { - temp -= A(k,i) * B(k,j); -/* L140: */ - } - if (nounit) { - temp /= A(i,i); - } - B(i,j) = temp; -/* L150: */ - } -/* L160: */ - } - } - } - } else { - if (lsame_(transa, "N")) { - -/* Form B := alpha*B*inv( A ). */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (*alpha != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = *alpha * B(i,j); -/* L170: */ - } - } - i__2 = j - 1; - for (k = 1; k <= j-1; ++k) { - if (A(k,j) != 0.) { - i__3 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) -= A(k,j) * B(i,k); -/* L180: */ - } - } -/* L190: */ - } - if (nounit) { - temp = 1. / A(j,j); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = temp * B(i,j); -/* L200: */ - } - } -/* L210: */ - } - } else { - for (j = *n; j >= 1; --j) { - if (*alpha != 1.) { - i__1 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = *alpha * B(i,j); -/* L220: */ - } - } - i__1 = *n; - for (k = j + 1; k <= *n; ++k) { - if (A(k,j) != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) -= A(k,j) * B(i,k); -/* L230: */ - } - } -/* L240: */ - } - if (nounit) { - temp = 1. / A(j,j); - i__1 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) = temp * B(i,j); -/* L250: */ - } - } -/* L260: */ - } - } - } else { - -/* Form B := alpha*B*inv( A' ). */ - - if (upper) { - for (k = *n; k >= 1; --k) { - if (nounit) { - temp = 1. / A(k,k); - i__1 = *m; - for (i = 1; i <= *m; ++i) { - B(i,k) = temp * B(i,k); -/* L270: */ - } - } - i__1 = k - 1; - for (j = 1; j <= k-1; ++j) { - if (A(j,k) != 0.) { - temp = A(j,k); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) -= temp * B(i,k); -/* L280: */ - } - } -/* L290: */ - } - if (*alpha != 1.) { - i__1 = *m; - for (i = 1; i <= *m; ++i) { - B(i,k) = *alpha * B(i,k); -/* L300: */ - } - } -/* L310: */ - } - } else { - i__1 = *n; - for (k = 1; k <= *n; ++k) { - if (nounit) { - temp = 1. / A(k,k); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,k) = temp * B(i,k); -/* L320: */ - } - } - i__2 = *n; - for (j = k + 1; j <= *n; ++j) { - if (A(j,k) != 0.) { - temp = A(j,k); - i__3 = *m; - for (i = 1; i <= *m; ++i) { - B(i,j) -= temp * B(i,k); -/* L330: */ - } - } -/* L340: */ - } - if (*alpha != 1.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - B(i,k) = *alpha * B(i,k); -/* L350: */ - } - } -/* L360: */ - } - } - } - } - - return 0; - -/* End of DTRSM . */ - -} /* dtrsm_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,338 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int dtrsv_(char *uplo, char *trans, char *diag, integer *n, - doublereal *a, integer *lda, doublereal *x, integer *incx) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static doublereal temp; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, jx, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical nounit; - - -/* Purpose - ======= - - DTRSV solves one of the systems of equations - - A*x = b, or A'*x = b, - - where b and x are n element vectors and A is an n by n unit, or - non-unit, upper or lower triangular matrix. - - No test for singularity or near-singularity is included in this - routine. Such tests must be performed before calling this routine. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix is an upper or - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANS - CHARACTER*1. - On entry, TRANS specifies the equations to be solved as - follows: - - TRANS = 'N' or 'n' A*x = b. - - TRANS = 'T' or 't' A'*x = b. - - TRANS = 'C' or 'c' A'*x = b. - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit - triangular as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - DOUBLE PRECISION array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element right-hand side vector b. On exit, X is overwritten - - with the solution vector x. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && - ! lsame_(trans, "C")) { - info = 2; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*lda < max(1,*n)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } - if (info != 0) { - xerbla_("DTRSV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - - nounit = lsame_(diag, "N"); - -/* Set up the start point in X if the increment is not unity. This - will be ( N - 1 )*INCX too small for descending loops. */ - - if (*incx <= 0) { - kx = 1 - (*n - 1) * *incx; - } else if (*incx != 1) { - kx = 1; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (lsame_(trans, "N")) { - -/* Form x := inv( A )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - if (X(j) != 0.) { - if (nounit) { - X(j) /= A(j,j); - } - temp = X(j); - for (i = j - 1; i >= 1; --i) { - X(i) -= temp * A(i,j); -/* L10: */ - } - } -/* L20: */ - } - } else { - jx = kx + (*n - 1) * *incx; - for (j = *n; j >= 1; --j) { - if (X(jx) != 0.) { - if (nounit) { - X(jx) /= A(j,j); - } - temp = X(jx); - ix = jx; - for (i = j - 1; i >= 1; --i) { - ix -= *incx; - X(ix) -= temp * A(i,j); -/* L30: */ - } - } - jx -= *incx; -/* L40: */ - } - } - } else { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0.) { - if (nounit) { - X(j) /= A(j,j); - } - temp = X(j); - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - X(i) -= temp * A(i,j); -/* L50: */ - } - } -/* L60: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.) { - if (nounit) { - X(jx) /= A(j,j); - } - temp = X(jx); - ix = jx; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - X(ix) -= temp * A(i,j); -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } - } else { - -/* Form x := inv( A' )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = X(j); - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - temp -= A(i,j) * X(i); -/* L90: */ - } - if (nounit) { - temp /= A(j,j); - } - X(j) = temp; -/* L100: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = X(jx); - ix = kx; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - temp -= A(i,j) * X(ix); - ix += *incx; -/* L110: */ - } - if (nounit) { - temp /= A(j,j); - } - X(jx) = temp; - jx += *incx; -/* L120: */ - } - } - } else { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - temp = X(j); - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - temp -= A(i,j) * X(i); -/* L130: */ - } - if (nounit) { - temp /= A(j,j); - } - X(j) = temp; -/* L140: */ - } - } else { - kx += (*n - 1) * *incx; - jx = kx; - for (j = *n; j >= 1; --j) { - temp = X(jx); - ix = kx; - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - temp -= A(i,j) * X(ix); - ix -= *incx; -/* L150: */ - } - if (nounit) { - temp /= A(j,j); - } - X(jx) = temp; - jx -= *incx; -/* L160: */ - } - } - } - } - - return 0; - -/* End of DTRSV . */ - -} /* dtrsv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dzasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dzasum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dzasum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dzasum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,68 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal dzasum_(integer *n, doublecomplex *zx, integer *incx) -{ - - - /* System generated locals */ - integer i__1; - doublereal ret_val; - - /* Local variables */ - static integer i; - static doublereal stemp; - extern doublereal dcabs1_(doublecomplex *); - static integer ix; - - -/* takes the sum of the absolute values. - jack dongarra, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define ZX(I) zx[(I)-1] - - - ret_val = 0.; - stemp = 0.; - if (*n <= 0 || *incx <= 0) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - ix = 1; - i__1 = *n; - for (i = 1; i <= *n; ++i) { - stemp += dcabs1_(&ZX(ix)); - ix += *incx; -/* L10: */ - } - ret_val = stemp; - return ret_val; - -/* code for increment equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - stemp += dcabs1_(&ZX(i)); -/* L30: */ - } - ret_val = stemp; - return ret_val; -} /* dzasum_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -doublereal dznrm2_(integer *n, doublecomplex *x, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - doublereal ret_val, d__1; - - /* Builtin functions */ - double d_imag(doublecomplex *), sqrt(doublereal); - - /* Local variables */ - static doublereal temp, norm, scale; - static integer ix; - static doublereal ssq; - - -/* DZNRM2 returns the euclidean norm of a vector via the function - name, so that - - DZNRM2 := sqrt( conjg( x' )*x ) - - - - -- This version written on 25-October-1982. - Modified on 14-October-1993 to inline the call to ZLASSQ. - Sven Hammarling, Nag Ltd. - - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - - - if (*n < 1 || *incx < 1) { - norm = 0.; - } else { - scale = 0.; - ssq = 1.; -/* The following loop is equivalent to this call to the LAPACK - - auxiliary routine: - CALL ZLASSQ( N, X, INCX, SCALE, SSQ ) */ - - i__1 = (*n - 1) * *incx + 1; - i__2 = *incx; - for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { - i__3 = ix; - if (X(ix).r != 0.) { - i__3 = ix; - temp = (d__1 = X(ix).r, abs(d__1)); - if (scale < temp) { -/* Computing 2nd power */ - d__1 = scale / temp; - ssq = ssq * (d__1 * d__1) + 1.; - scale = temp; - } else { -/* Computing 2nd power */ - d__1 = temp / scale; - ssq += d__1 * d__1; - } - } - if (d_imag(&X(ix)) != 0.) { - temp = (d__1 = d_imag(&X(ix)), abs(d__1)); - if (scale < temp) { -/* Computing 2nd power */ - d__1 = scale / temp; - ssq = ssq * (d__1 * d__1) + 1.; - scale = temp; - } else { -/* Computing 2nd power */ - d__1 = temp / scale; - ssq += d__1 * d__1; - } - } -/* L10: */ - } - norm = scale * sqrt(ssq); - } - - ret_val = norm; - return ret_val; - -/* End of DZNRM2. */ - -} /* dznrm2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/f2c.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/f2c.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/f2c.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/f2c.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#include "Cnames.h" - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - -typedef int integer; -typedef int logical; - -typedef char *address; -typedef short int shortint; -typedef float real; -typedef double doublereal; -typedef struct { real r, i; } complex; -typedef struct { doublereal r, i; } doublecomplex; -typedef short int shortlogical; -typedef char logical1; -typedef char integer1; -/* typedef long long longint; */ /* system-dependent */ - -#define TRUE_ (1) -#define FALSE_ (0) - -/* Extern is for use with -E */ -#ifndef Extern -#define Extern extern -#endif - -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define dabs(x) (doublereal)abs(x) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) -#define dmin(a,b) (doublereal)min(a,b) -#define dmax(a,b) (doublereal)max(a,b) - -#define VOID void - -#endif diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/icamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/icamax.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/icamax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/icamax.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -#include "f2c.h" - -integer icamax_(integer *n, complex *cx, integer *incx) -{ - /* System generated locals */ - integer ret_val, i__1, i__2; - real r__1, r__2; - /* Builtin functions */ - double r_imag(complex *); - /* Local variables */ - static real smax; - static integer i, ix; -/* finds the index of element having max. absolute value. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - Parameter adjustments - Function Body */ -#define CX(I) cx[(I)-1] - ret_val = 0; - if (*n < 1 || *incx <= 0) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } -/* code for increment not equal to 1 */ - ix = 1; - smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = ix; - if ((r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), dabs( - r__2)) <= smax) { - goto L5; - } - ret_val = i; - i__2 = ix; - smax = (r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), - dabs(r__2)); -L5: - ix += *incx; -/* L10: */ - } - return ret_val; -/* code for increment equal to 1 */ -L20: - smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = i; - if ((r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs( - r__2)) <= smax) { - goto L30; - } - ret_val = i; - i__2 = i; - smax = (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs( - r__2)); -L30: - ; - } - return ret_val; -} /* icamax_ */ - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/idamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/idamax.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/idamax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/idamax.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -integer idamax_(integer *n, doublereal *dx, integer *incx) -{ - - - /* System generated locals */ - integer ret_val, i__1; - doublereal d__1; - - /* Local variables */ - static doublereal dmax__; - static integer i, ix; - - -/* finds the index of element having max. absolute value. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define DX(I) dx[(I)-1] - - - ret_val = 0; - if (*n < 1 || *incx <= 0) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - ix = 1; - dmax__ = abs(DX(1)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if ((d__1 = DX(ix), abs(d__1)) <= dmax__) { - goto L5; - } - ret_val = i; - dmax__ = (d__1 = DX(ix), abs(d__1)); -L5: - ix += *incx; -/* L10: */ - } - return ret_val; - -/* code for increment equal to 1 */ - -L20: - dmax__ = abs(DX(1)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if ((d__1 = DX(i), abs(d__1)) <= dmax__) { - goto L30; - } - ret_val = i; - dmax__ = (d__1 = DX(i), abs(d__1)); -L30: - ; - } - return ret_val; -} /* idamax_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/isamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/isamax.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/isamax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/isamax.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -integer isamax_(integer *n, real *sx, integer *incx) -{ - - - /* System generated locals */ - integer ret_val, i__1; - real r__1; - - /* Local variables */ - static real smax; - static integer i, ix; - - -/* finds the index of element having max. absolute value. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SX(I) sx[(I)-1] - - - ret_val = 0; - if (*n < 1 || *incx <= 0) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - ix = 1; - smax = dabs(SX(1)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if ((r__1 = SX(ix), dabs(r__1)) <= smax) { - goto L5; - } - ret_val = i; - smax = (r__1 = SX(ix), dabs(r__1)); -L5: - ix += *incx; -/* L10: */ - } - return ret_val; - -/* code for increment equal to 1 */ - -L20: - smax = dabs(SX(1)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if ((r__1 = SX(i), dabs(r__1)) <= smax) { - goto L30; - } - ret_val = i; - smax = (r__1 = SX(i), dabs(r__1)); -L30: - ; - } - return ret_val; -} /* isamax_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/izamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/izamax.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/izamax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/izamax.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,81 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -integer izamax_(integer *n, doublecomplex *zx, integer *incx) -{ - - - /* System generated locals */ - integer ret_val, i__1; - - /* Local variables */ - static doublereal smax; - static integer i; - extern doublereal dcabs1_(doublecomplex *); - static integer ix; - - -/* finds the index of element having max. absolute value. - jack dongarra, 1/15/85. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define ZX(I) zx[(I)-1] - - - ret_val = 0; - if (*n < 1 || *incx <= 0) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - ix = 1; - smax = dcabs1_(&ZX(1)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if (dcabs1_(&ZX(ix)) <= smax) { - goto L5; - } - ret_val = i; - smax = dcabs1_(&ZX(ix)); -L5: - ix += *incx; -/* L10: */ - } - return ret_val; - -/* code for increment equal to 1 */ - -L20: - smax = dcabs1_(&ZX(1)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - if (dcabs1_(&ZX(i)) <= smax) { - goto L30; - } - ret_val = i; - smax = dcabs1_(&ZX(i)); -L30: - ; - } - return ret_val; -} /* izamax_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Makefile --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,92 +0,0 @@ -include ../make.inc -HEADER = ../SRC - -####################################################################### -# This is the makefile to create a library for C-BLAS. -# The files are organized as follows: -# -# SBLAS1 -- Single precision real BLAS routines -# CBLAS1 -- Single precision complex BLAS routines -# DBLAS1 -- Double precision real BLAS routines -# ZBLAS1 -- Double precision complex BLAS routines -# -# CB1AUX -- Real BLAS routines called by complex routines -# ZB1AUX -- D.P. real BLAS routines called by d.p. complex -# routines -# -# ALLBLAS -- Auxiliary routines for Level 2 and 3 BLAS -# -# SBLAS2 -- Single precision real BLAS2 routines -# CBLAS2 -- Single precision complex BLAS2 routines -# DBLAS2 -- Double precision real BLAS2 routines -# ZBLAS2 -- Double precision complex BLAS2 routines -# -# SBLAS3 -- Single precision real BLAS3 routines -# CBLAS3 -- Single precision complex BLAS3 routines -# DBLAS3 -- Double precision real BLAS3 routines -# ZBLAS3 -- Double precision complex BLAS3 routines -# -# The library can be set up to include routines for any combination -# of the four precisions. To create or add to the library, enter make -# followed by one or more of the precisions desired. Some examples: -# make single -# make single complex -# make single double complex complex16 -# Alternatively, the command -# make -# without any arguments creates a library of all four precisions. -# The library is called -# blas.a -# and is created at the next higher directory level. -# -# To remove the object files after the library is created, enter -# make clean -# -####################################################################### - -SBLAS1 = isamax.o sasum.o saxpy.o scopy.o sdot.o snrm2.o \ - srot.o sscal.o -SBLAS2 = sgemv.o ssymv.o strsv.o sger.o ssyr2.o - -DBLAS1 = idamax.o dasum.o daxpy.o dcopy.o ddot.o dnrm2.o \ - drot.o dscal.o -DBLAS2 = dgemv.o dsymv.o dtrsv.o dger.o dsyr2.o -DBLAS3 = dgemm.o dtrsm.o - -CBLAS1 = icamax.o scasum.o caxpy.o ccopy.o scnrm2.o \ - cscal.o -CBLAS2 = cgemv.o chemv.o ctrsv.o cgerc.o cgeru.o cher2.o - -ZBLAS1 = izamax.o dzasum.o zaxpy.o zcopy.o dznrm2.o \ - zscal.o dcabs1.o -ZBLAS2 = zgemv.o zhemv.o ztrsv.o zgerc.o zgeru.o zher2.o -ZBLAS3 = zgemm.o ztrsm.o - - -all: single double complex complex16 - -single: $(SBLAS1) $(SBLAS2) $(SBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \ - $(SBLAS2) $(SBLAS3) - $(RANLIB) $(BLASLIB) - -double: $(DBLAS1) $(DBLAS2) $(DBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \ - $(DBLAS2) $(DBLAS3) - $(RANLIB) $(BLASLIB) - -complex: $(CBLAS1) $(CBLAS2) $(CBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(ALLBLAS) \ - $(CBLAS2) $(CBLAS3) - $(RANLIB) $(BLASLIB) - -complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ALLBLAS) \ - $(ZBLAS2) $(ZBLAS3) - $(RANLIB) $(BLASLIB) - -.c.o: - $(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE) - -clean: - rm -f *.o diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sasum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sasum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sasum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,89 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -real sasum_(integer *n, real *sx, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - real ret_val, r__1, r__2, r__3, r__4, r__5, r__6; - - /* Local variables */ - static integer i, m, nincx; - static real stemp; - static integer mp1; - - -/* takes the sum of the absolute values. - uses unrolled loops for increment equal to one. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SX(I) sx[(I)-1] - - - ret_val = 0.f; - stemp = 0.f; - if (*n <= 0 || *incx <= 0) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - stemp += (r__1 = SX(i), dabs(r__1)); -/* L10: */ - } - ret_val = stemp; - return ret_val; - -/* code for increment equal to 1 - - - clean-up loop */ - -L20: - m = *n % 6; - if (m == 0) { - goto L40; - } - i__2 = m; - for (i = 1; i <= m; ++i) { - stemp += (r__1 = SX(i), dabs(r__1)); -/* L30: */ - } - if (*n < 6) { - goto L60; - } -L40: - mp1 = m + 1; - i__2 = *n; - for (i = mp1; i <= *n; i += 6) { - stemp = stemp + (r__1 = SX(i), dabs(r__1)) + (r__2 = SX(i + 1), dabs( - r__2)) + (r__3 = SX(i + 2), dabs(r__3)) + (r__4 = SX(i + 3), - dabs(r__4)) + (r__5 = SX(i + 4), dabs(r__5)) + (r__6 = SX(i + - 5), dabs(r__6)); -/* L50: */ - } -L60: - ret_val = stemp; - return ret_val; -} /* sasum_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/saxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/saxpy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/saxpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/saxpy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int saxpy_(integer *n, real *sa, real *sx, integer *incx, - real *sy, integer *incy) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i, m, ix, iy, mp1; - - -/* constant times a vector plus a vector. - uses unrolled loop for increments equal to one. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SY(I) sy[(I)-1] -#define SX(I) sx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*sa == 0.f) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - SY(iy) += *sa * SX(ix); - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 4; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - SY(i) += *sa * SX(i); -/* L30: */ - } - if (*n < 4) { - return 0; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 4) { - SY(i) += *sa * SX(i); - SY(i + 1) += *sa * SX(i + 1); - SY(i + 2) += *sa * SX(i + 2); - SY(i + 3) += *sa * SX(i + 3); -/* L50: */ - } - return 0; -} /* saxpy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scasum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scasum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scasum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,74 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -real scasum_(integer *n, complex *cx, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - real ret_val, r__1, r__2; - - /* Builtin functions */ - double r_imag(complex *); - - /* Local variables */ - static integer i, nincx; - static real stemp; - - -/* takes the sum of the absolute values of a complex vector and - returns a single precision result. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define CX(I) cx[(I)-1] - - - ret_val = 0.f; - stemp = 0.f; - if (*n <= 0 || *incx <= 0) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - i__3 = i; - stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX( - i)), dabs(r__2)); -/* L10: */ - } - ret_val = stemp; - return ret_val; - -/* code for increment equal to 1 */ - -L20: - i__2 = *n; - for (i = 1; i <= *n; ++i) { - i__1 = i; - stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX( - i)), dabs(r__2)); -/* L30: */ - } - ret_val = stemp; - return ret_val; -} /* scasum_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -real scnrm2_(integer *n, complex *x, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - real ret_val, r__1; - - /* Builtin functions */ - double r_imag(complex *), sqrt(doublereal); - - /* Local variables */ - static real temp, norm, scale; - static integer ix; - static real ssq; - - -/* SCNRM2 returns the euclidean norm of a vector via the function - name, so that - - SCNRM2 := sqrt( conjg( x' )*x ) - - - - -- This version written on 25-October-1982. - Modified on 14-October-1993 to inline the call to CLASSQ. - Sven Hammarling, Nag Ltd. - - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - - - if (*n < 1 || *incx < 1) { - norm = 0.f; - } else { - scale = 0.f; - ssq = 1.f; -/* The following loop is equivalent to this call to the LAPACK - - auxiliary routine: - CALL CLASSQ( N, X, INCX, SCALE, SSQ ) */ - - i__1 = (*n - 1) * *incx + 1; - i__2 = *incx; - for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { - i__3 = ix; - if (X(ix).r != 0.f) { - i__3 = ix; - temp = (r__1 = X(ix).r, dabs(r__1)); - if (scale < temp) { -/* Computing 2nd power */ - r__1 = scale / temp; - ssq = ssq * (r__1 * r__1) + 1.f; - scale = temp; - } else { -/* Computing 2nd power */ - r__1 = temp / scale; - ssq += r__1 * r__1; - } - } - if (r_imag(&X(ix)) != 0.f) { - temp = (r__1 = r_imag(&X(ix)), dabs(r__1)); - if (scale < temp) { -/* Computing 2nd power */ - r__1 = scale / temp; - ssq = ssq * (r__1 * r__1) + 1.f; - scale = temp; - } else { -/* Computing 2nd power */ - r__1 = temp / scale; - ssq += r__1 * r__1; - } - } -/* L10: */ - } - norm = scale * sqrt(ssq); - } - - ret_val = norm; - return ret_val; - -/* End of SCNRM2. */ - -} /* scnrm2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scopy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scopy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scopy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int scopy_(integer *n, real *sx, integer *incx, real *sy, - integer *incy) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i, m, ix, iy, mp1; - - -/* copies a vector, x, to a vector, y. - uses unrolled loops for increments equal to 1. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SY(I) sy[(I)-1] -#define SX(I) sx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - SY(iy) = SX(ix); - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 7; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - SY(i) = SX(i); -/* L30: */ - } - if (*n < 7) { - return 0; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 7) { - SY(i) = SX(i); - SY(i + 1) = SX(i + 1); - SY(i + 2) = SX(i + 2); - SY(i + 3) = SX(i + 3); - SY(i + 4) = SX(i + 4); - SY(i + 5) = SX(i + 5); - SY(i + 6) = SX(i + 6); -/* L50: */ - } - return 0; -} /* scopy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sdot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sdot.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sdot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sdot.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -real sdot_(integer *n, real *sx, integer *incx, real *sy, integer *incy) -{ - - - /* System generated locals */ - integer i__1; - real ret_val; - - /* Local variables */ - static integer i, m; - static real stemp; - static integer ix, iy, mp1; - - -/* forms the dot product of two vectors. - uses unrolled loops for increments equal to one. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SY(I) sy[(I)-1] -#define SX(I) sx[(I)-1] - - - stemp = 0.f; - ret_val = 0.f; - if (*n <= 0) { - return ret_val; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - stemp += SX(ix) * SY(iy); - ix += *incx; - iy += *incy; -/* L10: */ - } - ret_val = stemp; - return ret_val; - -/* code for both increments equal to 1 - - - clean-up loop */ - -L20: - m = *n % 5; - if (m == 0) { - goto L40; - } - i__1 = m; - for (i = 1; i <= m; ++i) { - stemp += SX(i) * SY(i); -/* L30: */ - } - if (*n < 5) { - goto L60; - } -L40: - mp1 = m + 1; - i__1 = *n; - for (i = mp1; i <= *n; i += 5) { - stemp = stemp + SX(i) * SY(i) + SX(i + 1) * SY(i + 1) + SX(i + 2) * - SY(i + 2) + SX(i + 3) * SY(i + 3) + SX(i + 4) * SY(i + 4); -/* L50: */ - } -L60: - ret_val = stemp; - return ret_val; -} /* sdot_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sgemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sgemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sgemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,299 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int sgemv_(char *trans, integer *m, integer *n, real *alpha, - real *a, integer *lda, real *x, integer *incx, real *beta, real *y, - integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static real temp; - static integer lenx, leny, i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - SGEMV performs one of the matrix-vector operations - - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - - where alpha and beta are scalars, x and y are vectors and A is an - m by n matrix. - - Parameters - ========== - - TRANS - CHARACTER*1. - On entry, TRANS specifies the operation to be performed as - follows: - - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - REAL . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - REAL array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - X - REAL array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - REAL . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - REAL array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - - updated vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! - lsame_(trans, "C")) { - info = 1; - } else if (*m < 0) { - info = 2; - } else if (*n < 0) { - info = 3; - } else if (*lda < max(1,*m)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } else if (*incy == 0) { - info = 11; - } - if (info != 0) { - xerbla_("SGEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; - } - -/* Set LENX and LENY, the lengths of the vectors x and y, and set - - up the start points in X and Y. */ - - if (lsame_(trans, "N")) { - lenx = *n; - leny = *m; - } else { - lenx = *m; - leny = *n; - } - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (lenx - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (leny - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. - - First form y := beta*y. */ - - if (*beta != 1.f) { - if (*incy == 1) { - if (*beta == 0.f) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(i) = 0.f; -/* L10: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(i) = *beta * Y(i); -/* L20: */ - } - } - } else { - iy = ky; - if (*beta == 0.f) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(iy) = 0.f; - iy += *incy; -/* L30: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - Y(iy) = *beta * Y(iy); - iy += *incy; -/* L40: */ - } - } - } - } - if (*alpha == 0.f) { - return 0; - } - if (lsame_(trans, "N")) { - -/* Form y := alpha*A*x + y. */ - - jx = kx; - if (*incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.f) { - temp = *alpha * X(jx); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - Y(i) += temp * A(i,j); -/* L50: */ - } - } - jx += *incx; -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.f) { - temp = *alpha * X(jx); - iy = ky; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - Y(iy) += temp * A(i,j); - iy += *incy; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } else { - -/* Form y := alpha*A'*x + y. */ - - jy = ky; - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = 0.f; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp += A(i,j) * X(i); -/* L90: */ - } - Y(jy) += *alpha * temp; - jy += *incy; -/* L100: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = 0.f; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp += A(i,j) * X(ix); - ix += *incx; -/* L110: */ - } - Y(jy) += *alpha * temp; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of SGEMV . */ - -} /* sgemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sger.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sger.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sger.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sger.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,181 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int sger_(integer *m, integer *n, real *alpha, real *x, - integer *incx, real *y, integer *incy, real *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static real temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - SGER performs the rank 1 operation - - A := alpha*x*y' + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - REAL . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - REAL array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - REAL array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("SGER ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || *alpha == 0.f) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (Y(jy) != 0.f) { - temp = *alpha * Y(jy); - i__2 = *m; - for (i = 1; i <= *m; ++i) { - A(i,j) += X(i) * temp; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (Y(jy) != 0.f) { - temp = *alpha * Y(jy); - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - A(i,j) += X(ix) * temp; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of SGER . */ - -} /* sger_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/snrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/snrm2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/snrm2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/snrm2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -real snrm2_(integer *n, real *x, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - real ret_val, r__1; - - /* Builtin functions */ - double sqrt(doublereal); - - /* Local variables */ - static real norm, scale, absxi; - static integer ix; - static real ssq; - - -/* SNRM2 returns the euclidean norm of a vector via the function - name, so that - - SNRM2 := sqrt( x'*x ) - - - - -- This version written on 25-October-1982. - Modified on 14-October-1993 to inline the call to SLASSQ. - Sven Hammarling, Nag Ltd. - - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - - - if (*n < 1 || *incx < 1) { - norm = 0.f; - } else if (*n == 1) { - norm = dabs(X(1)); - } else { - scale = 0.f; - ssq = 1.f; -/* The following loop is equivalent to this call to the LAPACK - - auxiliary routine: - CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */ - - i__1 = (*n - 1) * *incx + 1; - i__2 = *incx; - for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { - if (X(ix) != 0.f) { - absxi = (r__1 = X(ix), dabs(r__1)); - if (scale < absxi) { -/* Computing 2nd power */ - r__1 = scale / absxi; - ssq = ssq * (r__1 * r__1) + 1.f; - scale = absxi; - } else { -/* Computing 2nd power */ - r__1 = absxi / scale; - ssq += r__1 * r__1; - } - } -/* L10: */ - } - norm = scale * sqrt(ssq); - } - - ret_val = norm; - return ret_val; - -/* End of SNRM2. */ - -} /* snrm2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/srot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/srot.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/srot.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/srot.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int srot_(integer *n, real *sx, integer *incx, real *sy, - integer *incy, real *c, real *s) -{ - - - /* System generated locals */ - integer i__1; - - /* Local variables */ - static integer i; - static real stemp; - static integer ix, iy; - - -/* applies a plane rotation. - jack dongarra, linpack, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SY(I) sy[(I)-1] -#define SX(I) sx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments not equal - to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - stemp = *c * SX(ix) + *s * SY(iy); - SY(iy) = *c * SY(iy) - *s * SX(ix); - SX(ix) = stemp; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - stemp = *c * SX(i) + *s * SY(i); - SY(i) = *c * SY(i) - *s * SX(i); - SX(i) = stemp; -/* L30: */ - } - return 0; -} /* srot_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sscal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sscal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sscal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int sscal_(integer *n, real *sa, real *sx, integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2; - - /* Local variables */ - static integer i, m, nincx, mp1; - - -/* scales a vector by a constant. - uses unrolled loops for increment equal to 1. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define SX(I) sx[(I)-1] - - - if (*n <= 0 || *incx <= 0) { - return 0; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - SX(i) = *sa * SX(i); -/* L10: */ - } - return 0; - -/* code for increment equal to 1 - - - clean-up loop */ - -L20: - m = *n % 5; - if (m == 0) { - goto L40; - } - i__2 = m; - for (i = 1; i <= m; ++i) { - SX(i) = *sa * SX(i); -/* L30: */ - } - if (*n < 5) { - return 0; - } -L40: - mp1 = m + 1; - i__2 = *n; - for (i = mp1; i <= *n; i += 5) { - SX(i) = *sa * SX(i); - SX(i + 1) = *sa * SX(i + 1); - SX(i + 2) = *sa * SX(i + 2); - SX(i + 3) = *sa * SX(i + 3); - SX(i + 4) = *sa * SX(i + 4); -/* L50: */ - } - return 0; -} /* sscal_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssymv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssymv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssymv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssymv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,300 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int ssymv_(char *uplo, integer *n, real *alpha, real *a, - integer *lda, real *x, integer *incx, real *beta, real *y, integer * - incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static real temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - SSYMV performs the matrix-vector operation - - y := alpha*A*x + beta*y, - - where alpha and beta are scalars, x and y are n element vectors and - A is an n by n symmetric matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - REAL . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - REAL array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the symmetric matrix and the strictly - lower triangular part of A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the symmetric matrix and the strictly - upper triangular part of A is not referenced. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - REAL . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. On exit, Y is overwritten by the updated - vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*lda < max(1,*n)) { - info = 5; - } else if (*incx == 0) { - info = 7; - } else if (*incy == 0) { - info = 10; - } - if (info != 0) { - xerbla_("SSYMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; - } - -/* Set up the start points in X and Y. */ - - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. - - First form y := beta*y. */ - - if (*beta != 1.f) { - if (*incy == 1) { - if (*beta == 0.f) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(i) = 0.f; -/* L10: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(i) = *beta * Y(i); -/* L20: */ - } - } - } else { - iy = ky; - if (*beta == 0.f) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(iy) = 0.f; - iy += *incy; -/* L30: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - Y(iy) = *beta * Y(iy); - iy += *incy; -/* L40: */ - } - } - } - } - if (*alpha == 0.f) { - return 0; - } - if (lsame_(uplo, "U")) { - -/* Form y when A is stored in upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(j); - temp2 = 0.f; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - Y(i) += temp1 * A(i,j); - temp2 += A(i,j) * X(i); -/* L50: */ - } - Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2; -/* L60: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(jx); - temp2 = 0.f; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - Y(iy) += temp1 * A(i,j); - temp2 += A(i,j) * X(ix); - ix += *incx; - iy += *incy; -/* L70: */ - } - Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2; - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } else { - -/* Form y when A is stored in lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(j); - temp2 = 0.f; - Y(j) += temp1 * A(j,j); - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - Y(i) += temp1 * A(i,j); - temp2 += A(i,j) * X(i); -/* L90: */ - } - Y(j) += *alpha * temp2; -/* L100: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp1 = *alpha * X(jx); - temp2 = 0.f; - Y(jy) += temp1 * A(j,j); - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - Y(iy) += temp1 * A(i,j); - temp2 += A(i,j) * X(ix); -/* L110: */ - } - Y(jy) += *alpha * temp2; - jx += *incx; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of SSYMV . */ - -} /* ssymv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,263 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int ssyr2_(char *uplo, integer *n, real *alpha, real *x, - integer *incx, real *y, integer *incy, real *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static real temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - SSYR2 performs the symmetric rank 2 operation - - A := alpha*x*y' + alpha*y*x' + A, - - where alpha is a scalar, x and y are n element vectors and A is an n - - by n symmetric matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - REAL . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - REAL array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the symmetric matrix and the strictly - lower triangular part of A is not referenced. On exit, the - upper triangular part of the array A is overwritten by the - upper triangular part of the updated matrix. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the symmetric matrix and the strictly - upper triangular part of A is not referenced. On exit, the - lower triangular part of the array A is overwritten by the - lower triangular part of the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*n)) { - info = 9; - } - if (info != 0) { - xerbla_("SSYR2 ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || *alpha == 0.f) { - return 0; - } - -/* Set up the start points in X and Y if the increments are not both - - unity. */ - - if (*incx != 1 || *incy != 1) { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - jx = kx; - jy = ky; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. */ - - if (lsame_(uplo, "U")) { - -/* Form A when A is stored in the upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0.f || Y(j) != 0.f) { - temp1 = *alpha * Y(j); - temp2 = *alpha * X(j); - i__2 = j; - for (i = 1; i <= j; ++i) { - A(i,j) = A(i,j) + X(i) * temp1 - + Y(i) * temp2; -/* L10: */ - } - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.f || Y(jy) != 0.f) { - temp1 = *alpha * Y(jy); - temp2 = *alpha * X(jx); - ix = kx; - iy = ky; - i__2 = j; - for (i = 1; i <= j; ++i) { - A(i,j) = A(i,j) + X(ix) * temp1 - + Y(iy) * temp2; - ix += *incx; - iy += *incy; -/* L30: */ - } - } - jx += *incx; - jy += *incy; -/* L40: */ - } - } - } else { - -/* Form A when A is stored in the lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0.f || Y(j) != 0.f) { - temp1 = *alpha * Y(j); - temp2 = *alpha * X(j); - i__2 = *n; - for (i = j; i <= *n; ++i) { - A(i,j) = A(i,j) + X(i) * temp1 - + Y(i) * temp2; -/* L50: */ - } - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.f || Y(jy) != 0.f) { - temp1 = *alpha * Y(jy); - temp2 = *alpha * X(jx); - ix = jx; - iy = jy; - i__2 = *n; - for (i = j; i <= *n; ++i) { - A(i,j) = A(i,j) + X(ix) * temp1 - + Y(iy) * temp2; - ix += *incx; - iy += *incy; -/* L70: */ - } - } - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } - - return 0; - -/* End of SSYR2 . */ - -} /* ssyr2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/strsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/strsv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/strsv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/strsv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,338 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int strsv_(char *uplo, char *trans, char *diag, integer *n, - real *a, integer *lda, real *x, integer *incx) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2; - - /* Local variables */ - static integer info; - static real temp; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, jx, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical nounit; - - -/* Purpose - ======= - - STRSV solves one of the systems of equations - - A*x = b, or A'*x = b, - - where b and x are n element vectors and A is an n by n unit, or - non-unit, upper or lower triangular matrix. - - No test for singularity or near-singularity is included in this - routine. Such tests must be performed before calling this routine. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix is an upper or - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANS - CHARACTER*1. - On entry, TRANS specifies the equations to be solved as - follows: - - TRANS = 'N' or 'n' A*x = b. - - TRANS = 'T' or 't' A'*x = b. - - TRANS = 'C' or 'c' A'*x = b. - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit - triangular as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - A - REAL array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - REAL array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element right-hand side vector b. On exit, X is overwritten - - with the solution vector x. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && - ! lsame_(trans, "C")) { - info = 2; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*lda < max(1,*n)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } - if (info != 0) { - xerbla_("STRSV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - - nounit = lsame_(diag, "N"); - -/* Set up the start point in X if the increment is not unity. This - will be ( N - 1 )*INCX too small for descending loops. */ - - if (*incx <= 0) { - kx = 1 - (*n - 1) * *incx; - } else if (*incx != 1) { - kx = 1; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (lsame_(trans, "N")) { - -/* Form x := inv( A )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - if (X(j) != 0.f) { - if (nounit) { - X(j) /= A(j,j); - } - temp = X(j); - for (i = j - 1; i >= 1; --i) { - X(i) -= temp * A(i,j); -/* L10: */ - } - } -/* L20: */ - } - } else { - jx = kx + (*n - 1) * *incx; - for (j = *n; j >= 1; --j) { - if (X(jx) != 0.f) { - if (nounit) { - X(jx) /= A(j,j); - } - temp = X(jx); - ix = jx; - for (i = j - 1; i >= 1; --i) { - ix -= *incx; - X(ix) -= temp * A(i,j); -/* L30: */ - } - } - jx -= *incx; -/* L40: */ - } - } - } else { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(j) != 0.f) { - if (nounit) { - X(j) /= A(j,j); - } - temp = X(j); - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - X(i) -= temp * A(i,j); -/* L50: */ - } - } -/* L60: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (X(jx) != 0.f) { - if (nounit) { - X(jx) /= A(j,j); - } - temp = X(jx); - ix = jx; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - X(ix) -= temp * A(i,j); -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } - } else { - -/* Form x := inv( A' )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = X(j); - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - temp -= A(i,j) * X(i); -/* L90: */ - } - if (nounit) { - temp /= A(j,j); - } - X(j) = temp; -/* L100: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp = X(jx); - ix = kx; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - temp -= A(i,j) * X(ix); - ix += *incx; -/* L110: */ - } - if (nounit) { - temp /= A(j,j); - } - X(jx) = temp; - jx += *incx; -/* L120: */ - } - } - } else { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - temp = X(j); - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - temp -= A(i,j) * X(i); -/* L130: */ - } - if (nounit) { - temp /= A(j,j); - } - X(j) = temp; -/* L140: */ - } - } else { - kx += (*n - 1) * *incx; - jx = kx; - for (j = *n; j >= 1; --j) { - temp = X(jx); - ix = kx; - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - temp -= A(i,j) * X(ix); - ix -= *incx; -/* L150: */ - } - if (nounit) { - temp /= A(j,j); - } - X(jx) = temp; - jx -= *incx; -/* L160: */ - } - } - } - } - - return 0; - -/* End of STRSV . */ - -} /* strsv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#include "Cnames.h" - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - -typedef int integer; -typedef int logical; - -typedef char *address; -typedef short int shortint; -typedef float real; -typedef double doublereal; -typedef struct { real r, i; } complex; -typedef struct { doublereal r, i; } doublecomplex; -typedef short int shortlogical; -typedef char logical1; -typedef char integer1; -/* typedef long long longint; */ /* system-dependent */ - -#define TRUE_ (1) -#define FALSE_ (0) - -/* Extern is for use with -E */ -#ifndef Extern -#define Extern extern -#endif - -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define dabs(x) (doublereal)abs(x) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) -#define dmin(a,b) (doublereal)min(a,b) -#define dmax(a,b) (doublereal)max(a,b) - -#define VOID void - -#endif diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zaxpy_(integer *n, doublecomplex *za, doublecomplex *zx, - integer *incx, doublecomplex *zy, integer *incy) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3, i__4; - doublecomplex z__1, z__2; - - /* Local variables */ - static integer i; - extern doublereal dcabs1_(doublecomplex *); - static integer ix, iy; - - -/* constant times a vector plus a vector. - jack dongarra, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - Parameter adjustments - Function Body */ -#define ZY(I) zy[(I)-1] -#define ZX(I) zx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (dcabs1_(za) == 0.) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = iy; - i__4 = ix; - z__2.r = za->r * ZX(ix).r - za->i * ZX(ix).i, z__2.i = za->r * ZX( - ix).i + za->i * ZX(ix).r; - z__1.r = ZY(iy).r + z__2.r, z__1.i = ZY(iy).i + z__2.i; - ZY(iy).r = z__1.r, ZY(iy).i = z__1.i; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - i__4 = i; - z__2.r = za->r * ZX(i).r - za->i * ZX(i).i, z__2.i = za->r * ZX( - i).i + za->i * ZX(i).r; - z__1.r = ZY(i).r + z__2.r, z__1.i = ZY(i).i + z__2.i; - ZY(i).r = z__1.r, ZY(i).i = z__1.i; -/* L30: */ - } - return 0; -} /* zaxpy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zcopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zcopy.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zcopy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zcopy.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,74 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zcopy_(integer *n, doublecomplex *zx, integer *incx, - doublecomplex *zy, integer *incy) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - - /* Local variables */ - static integer i, ix, iy; - - -/* copies a vector, x, to a vector, y. - jack dongarra, linpack, 4/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define ZY(I) zy[(I)-1] -#define ZX(I) zx[(I)-1] - - - if (*n <= 0) { - return 0; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = ix; - ZY(iy).r = ZX(ix).r, ZY(iy).i = ZX(ix).i; - ix += *incx; - iy += *incy; -/* L10: */ - } - return 0; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - ZY(i).r = ZX(i).r, ZY(i).i = ZX(i).i; -/* L30: */ - } - return 0; -} /* zcopy_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zdotc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zdotc.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zdotc.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zdotc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,85 +0,0 @@ -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Double Complex */ VOID zdotc_(doublecomplex * ret_val, integer *n, - doublecomplex *zx, integer *incx, doublecomplex *zy, integer *incy) -{ - /* System generated locals */ - integer i__1, i__2; - doublecomplex z__1, z__2, z__3; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer i; - static doublecomplex ztemp; - static integer ix, iy; - - -/* forms the dot product of a vector. - jack dongarra, 3/11/78. - modified 12/3/93, array(1) declarations changed to array(*) - - - Parameter adjustments */ - --zy; - --zx; - - /* Function Body */ - ztemp.r = 0., ztemp.i = 0.; - ret_val->r = 0., ret_val->i = 0.; - if (*n <= 0) { - return ; - } - if (*incx == 1 && *incy == 1) { - goto L20; - } - -/* code for unequal increments or equal increments - not equal to 1 */ - - ix = 1; - iy = 1; - if (*incx < 0) { - ix = (-(*n) + 1) * *incx + 1; - } - if (*incy < 0) { - iy = (-(*n) + 1) * *incy + 1; - } - i__1 = *n; - for (i = 1; i <= *n; ++i) { - d_cnjg(&z__3, &zx[ix]); - i__2 = iy; - z__2.r = z__3.r * zy[iy].r - z__3.i * zy[iy].i, z__2.i = z__3.r * - zy[iy].i + z__3.i * zy[iy].r; - z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i; - ztemp.r = z__1.r, ztemp.i = z__1.i; - ix += *incx; - iy += *incy; -/* L10: */ - } - ret_val->r = ztemp.r, ret_val->i = ztemp.i; - return ; - -/* code for both increments equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - d_cnjg(&z__3, &zx[i]); - i__2 = i; - z__2.r = z__3.r * zy[i].r - z__3.i * zy[i].i, z__2.i = z__3.r * - zy[i].i + z__3.i * zy[i].r; - z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i; - ztemp.r = z__1.r, ztemp.i = z__1.i; -/* L30: */ - } - ret_val->r = ztemp.r, ret_val->i = ztemp.i; - return ; -} /* zdotc_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,697 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zgemm_(char *transa, char *transb, integer *m, integer * - n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda, - doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *c, - integer *ldc) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, - i__3, i__4, i__5, i__6; - doublecomplex z__1, z__2, z__3, z__4; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static logical nota, notb; - static doublecomplex temp; - static integer i, j, l; - static logical conja, conjb; - static integer ncola; - extern logical lsame_(char *, char *); - static integer nrowa, nrowb; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - ZGEMM performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - CHARACTER*1. - On entry, TRANSA specifies the form of op( A ) to be used in - - the matrix multiplication as follows: - - TRANSA = 'N' or 'n', op( A ) = A. - - TRANSA = 'T' or 't', op( A ) = A'. - - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - - Unchanged on exit. - - TRANSB - CHARACTER*1. - On entry, TRANSB specifies the form of op( B ) to be used in - - the matrix multiplication as follows: - - TRANSB = 'N' or 'n', op( B ) = B. - - TRANSB = 'T' or 't', op( B ) = B'. - - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix - - op( A ) and of the matrix C. M must be at least zero. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix - - op( B ) and the number of columns of the matrix C. N must be - - at least zero. - Unchanged on exit. - - K - INTEGER. - On entry, K specifies the number of columns of the matrix - - op( A ) and the number of rows of the matrix op( B ). K must - - be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is - - k when TRANSA = 'N' or 'n', and is m otherwise. - Before entry with TRANSA = 'N' or 'n', the leading m by k - - part of the array A must contain the matrix A, otherwise - - the leading k by m part of the array A must contain the - - matrix A. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. When TRANSA = 'N' or 'n' then - - LDA must be at least max( 1, m ), otherwise LDA must be at - - least max( 1, k ). - Unchanged on exit. - - B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is - - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - - part of the array B must contain the matrix B, otherwise - - the leading n by k part of the array B must contain the - - matrix B. - Unchanged on exit. - - LDB - INTEGER. - On entry, LDB specifies the first dimension of B as declared - - in the calling (sub) program. When TRANSB = 'N' or 'n' then - - LDB must be at least max( 1, k ), otherwise LDB must be at - - least max( 1, n ). - Unchanged on exit. - - BETA - COMPLEX*16 . - On entry, BETA specifies the scalar beta. When BETA is - - supplied as zero then C need not be set on input. - Unchanged on exit. - - C - COMPLEX*16 array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - - contain the matrix C, except when beta is zero, in which - - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - - ( alpha*op( A )*op( B ) + beta*C ). - - LDC - INTEGER. - On entry, LDC specifies the first dimension of C as declared - - in the calling (sub) program. LDC must be at least - - max( 1, m ). - Unchanged on exit. - - - Level 3 Blas routine. - - -- Written on 8-February-1989. - Jack Dongarra, Argonne National Laboratory. - Iain Duff, AERE Harwell. - Jeremy Du Croz, Numerical Algorithms Group Ltd. - Sven Hammarling, Numerical Algorithms Group Ltd. - - - - Set NOTA and NOTB as true if A and B respectively are not - - conjugated or transposed, set CONJA and CONJB as true if A and - - B respectively are to be transposed but not conjugated and set - - NROWA, NCOLA and NROWB as the number of rows and columns of A - - and the number of rows of B respectively. - - - Parameter adjustments - Function Body */ - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] -#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] -#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)] - - nota = lsame_(transa, "N"); - notb = lsame_(transb, "N"); - conja = lsame_(transa, "C"); - conjb = lsame_(transb, "C"); - if (nota) { - nrowa = *m; - ncola = *k; - } else { - nrowa = *k; - ncola = *m; - } - if (notb) { - nrowb = *k; - } else { - nrowb = *n; - } - -/* Test the input parameters. */ - - info = 0; - if (! nota && ! conja && ! lsame_(transa, "T")) { - info = 1; - } else if (! notb && ! conjb && ! lsame_(transb, "T")) { - info = 2; - } else if (*m < 0) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*k < 0) { - info = 5; - } else if (*lda < max(1,nrowa)) { - info = 8; - } else if (*ldb < max(1,nrowb)) { - info = 10; - } else if (*ldc < max(1,*m)) { - info = 13; - } - if (info != 0) { - xerbla_("ZGEMM ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0. || *k == 0) && - (beta->r == 1. && beta->i == 0.)) { - return 0; - } - -/* And when alpha.eq.zero. */ - - if (alpha->r == 0. && alpha->i == 0.) { - if (beta->r == 0. && beta->i == 0.) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - C(i,j).r = 0., C(i,j).i = 0.; -/* L10: */ - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - i__4 = i + j * c_dim1; - z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__1.i = beta->r * C(i,j).i + beta->i * C(i,j) - .r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L30: */ - } -/* L40: */ - } - } - return 0; - } - -/* Start the operations. */ - - if (notb) { - if (nota) { - -/* Form C := alpha*A*B + beta*C. */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (beta->r == 0. && beta->i == 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - C(i,j).r = 0., C(i,j).i = 0.; -/* L50: */ - } - } else if (beta->r != 1. || beta->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - i__4 = i + j * c_dim1; - z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L60: */ - } - } - i__2 = *k; - for (l = 1; l <= *k; ++l) { - i__3 = l + j * b_dim1; - if (B(l,j).r != 0. || B(l,j).i != 0.) { - i__3 = l + j * b_dim1; - z__1.r = alpha->r * B(l,j).r - alpha->i * B(l,j).i, - z__1.i = alpha->r * B(l,j).i + alpha->i * B(l,j).r; - temp.r = z__1.r, temp.i = z__1.i; - i__3 = *m; - for (i = 1; i <= *m; ++i) { - i__4 = i + j * c_dim1; - i__5 = i + j * c_dim1; - i__6 = i + l * a_dim1; - z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, - z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r; - z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + - z__2.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L70: */ - } - } -/* L80: */ - } -/* L90: */ - } - } else if (conja) { - -/* Form C := alpha*conjg( A' )*B + beta*C. */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - d_cnjg(&z__3, &A(l,i)); - i__4 = l + j * b_dim1; - z__2.r = z__3.r * B(l,j).r - z__3.i * B(l,j).i, - z__2.i = z__3.r * B(l,j).i + z__3.i * B(l,j) - .r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L100: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L110: */ - } -/* L120: */ - } - } else { - -/* Form C := alpha*A'*B + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - i__4 = l + i * a_dim1; - i__5 = l + j * b_dim1; - z__2.r = A(l,i).r * B(l,j).r - A(l,i).i * B(l,j) - .i, z__2.i = A(l,i).r * B(l,j).i + A(l,i) - .i * B(l,j).r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L130: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L140: */ - } -/* L150: */ - } - } - } else if (nota) { - if (conjb) { - -/* Form C := alpha*A*conjg( B' ) + beta*C. */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (beta->r == 0. && beta->i == 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - C(i,j).r = 0., C(i,j).i = 0.; -/* L160: */ - } - } else if (beta->r != 1. || beta->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - i__4 = i + j * c_dim1; - z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L170: */ - } - } - i__2 = *k; - for (l = 1; l <= *k; ++l) { - i__3 = j + l * b_dim1; - if (B(j,l).r != 0. || B(j,l).i != 0.) { - d_cnjg(&z__2, &B(j,l)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, - z__1.i = alpha->r * z__2.i + alpha->i * - z__2.r; - temp.r = z__1.r, temp.i = z__1.i; - i__3 = *m; - for (i = 1; i <= *m; ++i) { - i__4 = i + j * c_dim1; - i__5 = i + j * c_dim1; - i__6 = i + l * a_dim1; - z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, - z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r; - z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + - z__2.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L180: */ - } - } -/* L190: */ - } -/* L200: */ - } - } else { - -/* Form C := alpha*A*B' + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (beta->r == 0. && beta->i == 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - C(i,j).r = 0., C(i,j).i = 0.; -/* L210: */ - } - } else if (beta->r != 1. || beta->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * c_dim1; - i__4 = i + j * c_dim1; - z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L220: */ - } - } - i__2 = *k; - for (l = 1; l <= *k; ++l) { - i__3 = j + l * b_dim1; - if (B(j,l).r != 0. || B(j,l).i != 0.) { - i__3 = j + l * b_dim1; - z__1.r = alpha->r * B(j,l).r - alpha->i * B(j,l).i, - z__1.i = alpha->r * B(j,l).i + alpha->i * B(j,l).r; - temp.r = z__1.r, temp.i = z__1.i; - i__3 = *m; - for (i = 1; i <= *m; ++i) { - i__4 = i + j * c_dim1; - i__5 = i + j * c_dim1; - i__6 = i + l * a_dim1; - z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, - z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r; - z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + - z__2.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; -/* L230: */ - } - } -/* L240: */ - } -/* L250: */ - } - } - } else if (conja) { - if (conjb) { - -/* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - d_cnjg(&z__3, &A(l,i)); - d_cnjg(&z__4, &B(j,l)); - z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, z__2.i = - z__3.r * z__4.i + z__3.i * z__4.r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L260: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L270: */ - } -/* L280: */ - } - } else { - -/* Form C := alpha*conjg( A' )*B' + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - d_cnjg(&z__3, &A(l,i)); - i__4 = j + l * b_dim1; - z__2.r = z__3.r * B(j,l).r - z__3.i * B(j,l).i, - z__2.i = z__3.r * B(j,l).i + z__3.i * B(j,l) - .r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L290: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L300: */ - } -/* L310: */ - } - } - } else { - if (conjb) { - -/* Form C := alpha*A'*conjg( B' ) + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - i__4 = l + i * a_dim1; - d_cnjg(&z__3, &B(j,l)); - z__2.r = A(l,i).r * z__3.r - A(l,i).i * z__3.i, - z__2.i = A(l,i).r * z__3.i + A(l,i).i * - z__3.r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L320: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L330: */ - } -/* L340: */ - } - } else { - -/* Form C := alpha*A'*B' + beta*C */ - - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - temp.r = 0., temp.i = 0.; - i__3 = *k; - for (l = 1; l <= *k; ++l) { - i__4 = l + i * a_dim1; - i__5 = j + l * b_dim1; - z__2.r = A(l,i).r * B(j,l).r - A(l,i).i * B(j,l) - .i, z__2.i = A(l,i).r * B(j,l).i + A(l,i) - .i * B(j,l).r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L350: */ - } - if (beta->r == 0. && beta->i == 0.) { - i__3 = i + j * c_dim1; - z__1.r = alpha->r * temp.r - alpha->i * temp.i, - z__1.i = alpha->r * temp.i + alpha->i * - temp.r; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } else { - i__3 = i + j * c_dim1; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, - z__2.i = alpha->r * temp.i + alpha->i * - temp.r; - i__4 = i + j * c_dim1; - z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, - z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - C(i,j).r = z__1.r, C(i,j).i = z__1.i; - } -/* L360: */ - } -/* L370: */ - } - } - } - - return 0; - -/* End of ZGEMM . */ - -} /* zgemm_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,400 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zgemv_(char *trans, integer *m, integer *n, - doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex * - x, integer *incx, doublecomplex *beta, doublecomplex *y, integer * - incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublecomplex z__1, z__2, z__3; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp; - static integer lenx, leny, i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical noconj; - - -/* Purpose - ======= - - ZGEMV performs one of the matrix-vector operations - - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or - - y := alpha*conjg( A' )*x + beta*y, - - where alpha and beta are scalars, x and y are vectors and A is an - m by n matrix. - - Parameters - ========== - - TRANS - CHARACTER*1. - On entry, TRANS specifies the operation to be performed as - follows: - - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - - TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - X - COMPLEX*16 array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - COMPLEX*16 . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - COMPLEX*16 array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - - updated vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! - lsame_(trans, "C")) { - info = 1; - } else if (*m < 0) { - info = 2; - } else if (*n < 0) { - info = 3; - } else if (*lda < max(1,*m)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } else if (*incy == 0) { - info = 11; - } - if (info != 0) { - xerbla_("ZGEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == - 1. && beta->i == 0.)) { - return 0; - } - - noconj = lsame_(trans, "T"); - -/* Set LENX and LENY, the lengths of the vectors x and y, and set - - up the start points in X and Y. */ - - if (lsame_(trans, "N")) { - lenx = *n; - leny = *m; - } else { - lenx = *m; - leny = *n; - } - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (lenx - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (leny - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. - - First form y := beta*y. */ - - if (beta->r != 1. || beta->i != 0.) { - if (*incy == 1) { - if (beta->r == 0. && beta->i == 0.) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = i; - Y(i).r = 0., Y(i).i = 0.; -/* L10: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = i; - i__3 = i; - z__1.r = beta->r * Y(i).r - beta->i * Y(i).i, - z__1.i = beta->r * Y(i).i + beta->i * Y(i) - .r; - Y(i).r = z__1.r, Y(i).i = z__1.i; -/* L20: */ - } - } - } else { - iy = ky; - if (beta->r == 0. && beta->i == 0.) { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = iy; - Y(iy).r = 0., Y(iy).i = 0.; - iy += *incy; -/* L30: */ - } - } else { - i__1 = leny; - for (i = 1; i <= leny; ++i) { - i__2 = iy; - i__3 = iy; - z__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, - z__1.i = beta->r * Y(iy).i + beta->i * Y(iy) - .r; - Y(iy).r = z__1.r, Y(iy).i = z__1.i; - iy += *incy; -/* L40: */ - } - } - } - } - if (alpha->r == 0. && alpha->i == 0.) { - return 0; - } - if (lsame_(trans, "N")) { - -/* Form y := alpha*A*x + y. */ - - jx = kx; - if (*incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0. || X(jx).i != 0.) { - i__2 = jx; - z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - z__1.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - temp.r = z__1.r, temp.i = z__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j) - .r; - z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + - z__2.i; - Y(i).r = z__1.r, Y(i).i = z__1.i; -/* L50: */ - } - } - jx += *incx; -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0. || X(jx).i != 0.) { - i__2 = jx; - z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - z__1.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - temp.r = z__1.r, temp.i = z__1.i; - iy = ky; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j) - .r; - z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + - z__2.i; - Y(iy).r = z__1.r, Y(iy).i = z__1.i; - iy += *incy; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } else { - -/* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. - */ - - jy = ky; - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp.r = 0., temp.i = 0.; - if (noconj) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i; - z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(i) - .i, z__2.i = A(i,j).r * X(i).i + A(i,j) - .i * X(i).r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L90: */ - } - } else { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - d_cnjg(&z__3, &A(i,j)); - i__3 = i; - z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, - z__2.i = z__3.r * X(i).i + z__3.i * X(i) - .r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L100: */ - } - } - i__2 = jy; - i__3 = jy; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i = - alpha->r * temp.i + alpha->i * temp.r; - z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i; - Y(jy).r = z__1.r, Y(jy).i = z__1.i; - jy += *incy; -/* L110: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - temp.r = 0., temp.i = 0.; - ix = kx; - if (noconj) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = ix; - z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix) - .i, z__2.i = A(i,j).r * X(ix).i + A(i,j) - .i * X(ix).r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix += *incx; -/* L120: */ - } - } else { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - d_cnjg(&z__3, &A(i,j)); - i__3 = ix; - z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, - z__2.i = z__3.r * X(ix).i + z__3.i * X(ix) - .r; - z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix += *incx; -/* L130: */ - } - } - i__2 = jy; - i__3 = jy; - z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i = - alpha->r * temp.i + alpha->i * temp.r; - z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i; - Y(jy).r = z__1.r, Y(jy).i = z__1.i; - jy += *incy; -/* L140: */ - } - } - } - - return 0; - -/* End of ZGEMV . */ - -} /* zgemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgerc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgerc.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgerc.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgerc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,206 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zgerc_(integer *m, integer *n, doublecomplex *alpha, - doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, - doublecomplex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublecomplex z__1, z__2; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - ZGERC performs the rank 1 operation - - A := alpha*x*conjg( y' ) + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX*16 array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("ZGERC ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0. || Y(jy).i != 0.) { - d_cnjg(&z__2, &Y(jy)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp.r = z__1.r, temp.i = z__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - z__2.r = X(i).r * temp.r - X(i).i * temp.i, z__2.i = - X(i).r * temp.i + X(i).i * temp.r; - z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0. || Y(jy).i != 0.) { - d_cnjg(&z__2, &Y(jy)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp.r = z__1.r, temp.i = z__1.i; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - z__2.r = X(ix).r * temp.r - X(ix).i * temp.i, z__2.i = - X(ix).r * temp.i + X(ix).i * temp.r; - z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of ZGERC . */ - -} /* zgerc_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgeru.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgeru.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgeru.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgeru.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,203 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zgeru_(integer *m, integer *n, doublecomplex *alpha, - doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, - doublecomplex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublecomplex z__1, z__2; - - /* Local variables */ - static integer info; - static doublecomplex temp; - static integer i, j, ix, jy, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - ZGERU performs the rank 1 operation - - A := alpha*x*y' + A, - - where alpha is a scalar, x is an m element vector, y is an n element - - vector and A is an m by n matrix. - - Parameters - ========== - - M - INTEGER. - On entry, M specifies the number of rows of the matrix A. - M must be at least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of the matrix A. - - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX*16 array of dimension at least - ( 1 + ( m - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the m - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. On exit, A is - overwritten by the updated matrix. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, m ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (*m < 0) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*m)) { - info = 9; - } - if (info != 0) { - xerbla_("ZGERU ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) { - return 0; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (*incy > 0) { - jy = 1; - } else { - jy = 1 - (*n - 1) * *incy; - } - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0. || Y(jy).i != 0.) { - i__2 = jy; - z__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, z__1.i = - alpha->r * Y(jy).i + alpha->i * Y(jy).r; - temp.r = z__1.r, temp.i = z__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - z__2.r = X(i).r * temp.r - X(i).i * temp.i, z__2.i = - X(i).r * temp.i + X(i).i * temp.r; - z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; -/* L10: */ - } - } - jy += *incy; -/* L20: */ - } - } else { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*m - 1) * *incx; - } - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jy; - if (Y(jy).r != 0. || Y(jy).i != 0.) { - i__2 = jy; - z__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, z__1.i = - alpha->r * Y(jy).i + alpha->i * Y(jy).r; - temp.r = z__1.r, temp.i = z__1.i; - ix = kx; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - z__2.r = X(ix).r * temp.r - X(ix).i * temp.i, z__2.i = - X(ix).r * temp.i + X(ix).i * temp.r; - z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; - ix += *incx; -/* L30: */ - } - } - jy += *incy; -/* L40: */ - } - } - - return 0; - -/* End of ZGERU . */ - -} /* zgeru_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zhemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zhemv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zhemv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zhemv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,421 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zhemv_(char *uplo, integer *n, doublecomplex *alpha, - doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, - doublecomplex *beta, doublecomplex *y, integer *incy) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublereal d__1; - doublecomplex z__1, z__2, z__3, z__4; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - ZHEMV performs the matrix-vector operation - - y := alpha*A*x + beta*y, - - where alpha and beta are scalars, x and y are n element vectors and - A is an n by n hermitian matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the hermitian matrix and the strictly - lower triangular part of A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the hermitian matrix and the strictly - upper triangular part of A is not referenced. - Note that the imaginary parts of the diagonal elements need - - not be set and are assumed to be zero. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - BETA - COMPLEX*16 . - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - Unchanged on exit. - - Y - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. On exit, Y is overwritten by the updated - vector y. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*lda < max(1,*n)) { - info = 5; - } else if (*incx == 0) { - info = 7; - } else if (*incy == 0) { - info = 10; - } - if (info != 0) { - xerbla_("ZHEMV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && - beta->i == 0.)) { - return 0; - } - -/* Set up the start points in X and Y. */ - - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. - - First form y := beta*y. */ - - if (beta->r != 1. || beta->i != 0.) { - if (*incy == 1) { - if (beta->r == 0. && beta->i == 0.) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - Y(i).r = 0., Y(i).i = 0.; -/* L10: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - z__1.r = beta->r * Y(i).r - beta->i * Y(i).i, - z__1.i = beta->r * Y(i).i + beta->i * Y(i) - .r; - Y(i).r = z__1.r, Y(i).i = z__1.i; -/* L20: */ - } - } - } else { - iy = ky; - if (beta->r == 0. && beta->i == 0.) { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - Y(iy).r = 0., Y(iy).i = 0.; - iy += *incy; -/* L30: */ - } - } else { - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = iy; - i__3 = iy; - z__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, - z__1.i = beta->r * Y(iy).i + beta->i * Y(iy) - .r; - Y(iy).r = z__1.r, Y(iy).i = z__1.i; - iy += *incy; -/* L40: */ - } - } - } - } - if (alpha->r == 0. && alpha->i == 0.) { - return 0; - } - if (lsame_(uplo, "U")) { - -/* Form y when A is stored in upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - z__1.r = alpha->r * X(j).r - alpha->i * X(j).i, z__1.i = - alpha->r * X(j).i + alpha->i * X(j).r; - temp1.r = z__1.r, temp1.i = z__1.i; - temp2.r = 0., temp2.i = 0.; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + z__2.i; - Y(i).r = z__1.r, Y(i).i = z__1.i; - d_cnjg(&z__3, &A(i,j)); - i__3 = i; - z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, z__2.i = - z__3.r * X(i).i + z__3.i * X(i).r; - z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; - temp2.r = z__1.r, temp2.i = z__1.i; -/* L50: */ - } - i__2 = j; - i__3 = j; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; - z__2.r = Y(j).r + z__3.r, z__2.i = Y(j).i + z__3.i; - z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = - alpha->r * temp2.i + alpha->i * temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - Y(j).r = z__1.r, Y(j).i = z__1.i; -/* L60: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, z__1.i = - alpha->r * X(jx).i + alpha->i * X(jx).r; - temp1.r = z__1.r, temp1.i = z__1.i; - temp2.r = 0., temp2.i = 0.; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + z__2.i; - Y(iy).r = z__1.r, Y(iy).i = z__1.i; - d_cnjg(&z__3, &A(i,j)); - i__3 = ix; - z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, z__2.i = - z__3.r * X(ix).i + z__3.i * X(ix).r; - z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; - temp2.r = z__1.r, temp2.i = z__1.i; - ix += *incx; - iy += *incy; -/* L70: */ - } - i__2 = jy; - i__3 = jy; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; - z__2.r = Y(jy).r + z__3.r, z__2.i = Y(jy).i + z__3.i; - z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = - alpha->r * temp2.i + alpha->i * temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - Y(jy).r = z__1.r, Y(jy).i = z__1.i; - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } else { - -/* Form y when A is stored in lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - z__1.r = alpha->r * X(j).r - alpha->i * X(j).i, z__1.i = - alpha->r * X(j).i + alpha->i * X(j).r; - temp1.r = z__1.r, temp1.i = z__1.i; - temp2.r = 0., temp2.i = 0.; - i__2 = j; - i__3 = j; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; - z__1.r = Y(j).r + z__2.r, z__1.i = Y(j).i + z__2.i; - Y(j).r = z__1.r, Y(j).i = z__1.i; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + z__2.i; - Y(i).r = z__1.r, Y(i).i = z__1.i; - d_cnjg(&z__3, &A(i,j)); - i__3 = i; - z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, z__2.i = - z__3.r * X(i).i + z__3.i * X(i).r; - z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; - temp2.r = z__1.r, temp2.i = z__1.i; -/* L90: */ - } - i__2 = j; - i__3 = j; - z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = - alpha->r * temp2.i + alpha->i * temp2.r; - z__1.r = Y(j).r + z__2.r, z__1.i = Y(j).i + z__2.i; - Y(j).r = z__1.r, Y(j).i = z__1.i; -/* L100: */ - } - } else { - jx = kx; - jy = ky; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, z__1.i = - alpha->r * X(jx).i + alpha->i * X(jx).r; - temp1.r = z__1.r, temp1.i = z__1.i; - temp2.r = 0., temp2.i = 0.; - i__2 = jy; - i__3 = jy; - i__4 = j + j * a_dim1; - d__1 = A(j,j).r; - z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; - z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i; - Y(jy).r = z__1.r, Y(jy).i = z__1.i; - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - i__3 = iy; - i__4 = iy; - i__5 = i + j * a_dim1; - z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, - z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j) - .r; - z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + z__2.i; - Y(iy).r = z__1.r, Y(iy).i = z__1.i; - d_cnjg(&z__3, &A(i,j)); - i__3 = ix; - z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, z__2.i = - z__3.r * X(ix).i + z__3.i * X(ix).r; - z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; - temp2.r = z__1.r, temp2.i = z__1.i; -/* L110: */ - } - i__2 = jy; - i__3 = jy; - z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = - alpha->r * temp2.i + alpha->i * temp2.r; - z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i; - Y(jy).r = z__1.r, Y(jy).i = z__1.i; - jx += *incx; - jy += *incy; -/* L120: */ - } - } - } - - return 0; - -/* End of ZHEMV . */ - -} /* zhemv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zher2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zher2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zher2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zher2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,437 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zher2_(char *uplo, integer *n, doublecomplex *alpha, - doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, - doublecomplex *a, integer *lda) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - doublereal d__1; - doublecomplex z__1, z__2, z__3, z__4; - - /* Builtin functions */ - void d_cnjg(doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp1, temp2; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, iy, jx, jy, kx, ky; - extern /* Subroutine */ int xerbla_(char *, integer *); - - -/* Purpose - ======= - - ZHER2 performs the hermitian rank 2 operation - - A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, - - where alpha is a scalar, x and y are n element vectors and A is an n - - by n hermitian matrix. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the upper or lower - triangular part of the array A is to be referenced as - follows: - - UPLO = 'U' or 'u' Only the upper triangular part of A - is to be referenced. - - UPLO = 'L' or 'l' Only the lower triangular part of A - is to be referenced. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. - Unchanged on exit. - - X - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element vector x. - Unchanged on exit. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - Y - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCY ) ). - Before entry, the incremented array Y must contain the n - element vector y. - Unchanged on exit. - - INCY - INTEGER. - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular part of the hermitian matrix and the strictly - lower triangular part of A is not referenced. On exit, the - upper triangular part of the array A is overwritten by the - upper triangular part of the updated matrix. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular part of the hermitian matrix and the strictly - upper triangular part of A is not referenced. On exit, the - lower triangular part of the array A is overwritten by the - lower triangular part of the updated matrix. - Note that the imaginary parts of the diagonal elements need - - not be set, they are assumed to be zero, and on exit they - are set to zero. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] -#define Y(I) y[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (*n < 0) { - info = 2; - } else if (*incx == 0) { - info = 5; - } else if (*incy == 0) { - info = 7; - } else if (*lda < max(1,*n)) { - info = 9; - } - if (info != 0) { - xerbla_("ZHER2 ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0 || alpha->r == 0. && alpha->i == 0.) { - return 0; - } - -/* Set up the start points in X and Y if the increments are not both - - unity. */ - - if (*incx != 1 || *incy != 1) { - if (*incx > 0) { - kx = 1; - } else { - kx = 1 - (*n - 1) * *incx; - } - if (*incy > 0) { - ky = 1; - } else { - ky = 1 - (*n - 1) * *incy; - } - jx = kx; - jy = ky; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through the triangular part - of A. */ - - if (lsame_(uplo, "U")) { - -/* Form A when A is stored in the upper triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - i__3 = j; - if (X(j).r != 0. || X(j).i != 0. || (Y(j).r != 0. || - Y(j).i != 0.)) { - d_cnjg(&z__2, &Y(j)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp1.r = z__1.r, temp1.i = z__1.i; - i__2 = j; - z__2.r = alpha->r * X(j).r - alpha->i * X(j).i, - z__2.i = alpha->r * X(j).i + alpha->i * X(j) - .r; - d_cnjg(&z__1, &z__2); - temp2.r = z__1.r, temp2.i = z__1.i; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - z__3.r = X(i).r * temp1.r - X(i).i * temp1.i, - z__3.i = X(i).r * temp1.i + X(i).i * - temp1.r; - z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + - z__3.i; - i__6 = i; - z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, - z__4.i = Y(i).r * temp2.i + Y(i).i * - temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; -/* L10: */ - } - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = j; - z__2.r = X(j).r * temp1.r - X(j).i * temp1.i, - z__2.i = X(j).r * temp1.i + X(j).i * - temp1.r; - i__5 = j; - z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, - z__3.i = Y(j).r * temp2.i + Y(j).i * - temp2.r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - d__1 = A(j,j).r + z__1.r; - A(j,j).r = d__1, A(j,j).i = 0.; - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.; - } -/* L20: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - i__3 = jy; - if (X(jx).r != 0. || X(jx).i != 0. || (Y(jy).r != 0. || - Y(jy).i != 0.)) { - d_cnjg(&z__2, &Y(jy)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp1.r = z__1.r, temp1.i = z__1.i; - i__2 = jx; - z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - z__2.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - d_cnjg(&z__1, &z__2); - temp2.r = z__1.r, temp2.i = z__1.i; - ix = kx; - iy = ky; - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, - z__3.i = X(ix).r * temp1.i + X(ix).i * - temp1.r; - z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + - z__3.i; - i__6 = iy; - z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, - z__4.i = Y(iy).r * temp2.i + Y(iy).i * - temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; - ix += *incx; - iy += *incy; -/* L30: */ - } - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = jx; - z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, - z__2.i = X(jx).r * temp1.i + X(jx).i * - temp1.r; - i__5 = jy; - z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, - z__3.i = Y(jy).r * temp2.i + Y(jy).i * - temp2.r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - d__1 = A(j,j).r + z__1.r; - A(j,j).r = d__1, A(j,j).i = 0.; - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.; - } - jx += *incx; - jy += *incy; -/* L40: */ - } - } - } else { - -/* Form A when A is stored in the lower triangle. */ - - if (*incx == 1 && *incy == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - i__3 = j; - if (X(j).r != 0. || X(j).i != 0. || (Y(j).r != 0. || - Y(j).i != 0.)) { - d_cnjg(&z__2, &Y(j)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp1.r = z__1.r, temp1.i = z__1.i; - i__2 = j; - z__2.r = alpha->r * X(j).r - alpha->i * X(j).i, - z__2.i = alpha->r * X(j).i + alpha->i * X(j) - .r; - d_cnjg(&z__1, &z__2); - temp2.r = z__1.r, temp2.i = z__1.i; - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = j; - z__2.r = X(j).r * temp1.r - X(j).i * temp1.i, - z__2.i = X(j).r * temp1.i + X(j).i * - temp1.r; - i__5 = j; - z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, - z__3.i = Y(j).r * temp2.i + Y(j).i * - temp2.r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - d__1 = A(j,j).r + z__1.r; - A(j,j).r = d__1, A(j,j).i = 0.; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = i; - z__3.r = X(i).r * temp1.r - X(i).i * temp1.i, - z__3.i = X(i).r * temp1.i + X(i).i * - temp1.r; - z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + - z__3.i; - i__6 = i; - z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, - z__4.i = Y(i).r * temp2.i + Y(i).i * - temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; -/* L50: */ - } - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.; - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - i__3 = jy; - if (X(jx).r != 0. || X(jx).i != 0. || (Y(jy).r != 0. || - Y(jy).i != 0.)) { - d_cnjg(&z__2, &Y(jy)); - z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = - alpha->r * z__2.i + alpha->i * z__2.r; - temp1.r = z__1.r, temp1.i = z__1.i; - i__2 = jx; - z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, - z__2.i = alpha->r * X(jx).i + alpha->i * X(jx) - .r; - d_cnjg(&z__1, &z__2); - temp2.r = z__1.r, temp2.i = z__1.i; - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - i__4 = jx; - z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, - z__2.i = X(jx).r * temp1.i + X(jx).i * - temp1.r; - i__5 = jy; - z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, - z__3.i = Y(jy).r * temp2.i + Y(jy).i * - temp2.r; - z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; - d__1 = A(j,j).r + z__1.r; - A(j,j).r = d__1, A(j,j).i = 0.; - ix = jx; - iy = jy; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - iy += *incy; - i__3 = i + j * a_dim1; - i__4 = i + j * a_dim1; - i__5 = ix; - z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, - z__3.i = X(ix).r * temp1.i + X(ix).i * - temp1.r; - z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + - z__3.i; - i__6 = iy; - z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, - z__4.i = Y(iy).r * temp2.i + Y(iy).i * - temp2.r; - z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; - A(i,j).r = z__1.r, A(i,j).i = z__1.i; -/* L70: */ - } - } else { - i__2 = j + j * a_dim1; - i__3 = j + j * a_dim1; - d__1 = A(j,j).r; - A(j,j).r = d__1, A(j,j).i = 0.; - } - jx += *incx; - jy += *incy; -/* L80: */ - } - } - } - - return 0; - -/* End of ZHER2 . */ - -} /* zher2_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zscal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zscal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zscal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int zscal_(integer *n, doublecomplex *za, doublecomplex *zx, - integer *incx) -{ - - - /* System generated locals */ - integer i__1, i__2, i__3; - doublecomplex z__1; - - /* Local variables */ - static integer i, ix; - - -/* scales a vector by a constant. - jack dongarra, 3/11/78. - modified 3/93 to return if incx .le. 0. - modified 12/3/93, array(1) declarations changed to array(*) - - - - Parameter adjustments - Function Body */ -#define ZX(I) zx[(I)-1] - - - if (*n <= 0 || *incx <= 0) { - return 0; - } - if (*incx == 1) { - goto L20; - } - -/* code for increment not equal to 1 */ - - ix = 1; - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = ix; - i__3 = ix; - z__1.r = za->r * ZX(ix).r - za->i * ZX(ix).i, z__1.i = za->r * ZX( - ix).i + za->i * ZX(ix).r; - ZX(ix).r = z__1.r, ZX(ix).i = z__1.i; - ix += *incx; -/* L10: */ - } - return 0; - -/* code for increment equal to 1 */ - -L20: - i__1 = *n; - for (i = 1; i <= *n; ++i) { - i__2 = i; - i__3 = i; - z__1.r = za->r * ZX(i).r - za->i * ZX(i).i, z__1.i = za->r * ZX( - i).i + za->i * ZX(i).r; - ZX(i).r = z__1.r, ZX(i).i = z__1.i; -/* L30: */ - } - return 0; -} /* zscal_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,692 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Table of constant values */ - -static doublecomplex c_b1 = {1.,0.}; - -/* Subroutine */ int ztrsm_(char *side, char *uplo, char *transa, char *diag, - integer *m, integer *n, doublecomplex *alpha, doublecomplex *a, - integer *lda, doublecomplex *b, integer *ldb) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, - i__6, i__7; - doublecomplex z__1, z__2, z__3; - - /* Builtin functions */ - void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg( - doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp; - static integer i, j, k; - static logical lside; - extern logical lsame_(char *, char *); - static integer nrowa; - static logical upper; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical noconj, nounit; - - -/* Purpose - ======= - - ZTRSM solves one of the matrix equations - - op( A )*X = alpha*B, or X*op( A ) = alpha*B, - - where alpha is a scalar, X and B are m by n matrices, A is a unit, or - - non-unit, upper or lower triangular matrix and op( A ) is one of - - - op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). - - The matrix X is overwritten on B. - - Parameters - ========== - - SIDE - CHARACTER*1. - On entry, SIDE specifies whether op( A ) appears on the left - - or right of X as follows: - - SIDE = 'L' or 'l' op( A )*X = alpha*B. - - SIDE = 'R' or 'r' X*op( A ) = alpha*B. - - Unchanged on exit. - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix A is an upper or - - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANSA - CHARACTER*1. - On entry, TRANSA specifies the form of op( A ) to be used in - - the matrix multiplication as follows: - - TRANSA = 'N' or 'n' op( A ) = A. - - TRANSA = 'T' or 't' op( A ) = A'. - - TRANSA = 'C' or 'c' op( A ) = conjg( A' ). - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit triangular - - as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - M - INTEGER. - On entry, M specifies the number of rows of B. M must be at - - least zero. - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the number of columns of B. N must be - - at least zero. - Unchanged on exit. - - ALPHA - COMPLEX*16 . - On entry, ALPHA specifies the scalar alpha. When alpha is - - zero then A is not referenced and B need not be set before - - entry. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m - - when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. - - Before entry with UPLO = 'U' or 'u', the leading k by k - - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading k by k - - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. When SIDE = 'L' or 'l' then - - LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' - - then LDA must be at least max( 1, n ). - Unchanged on exit. - - B - COMPLEX*16 array of DIMENSION ( LDB, n ). - Before entry, the leading m by n part of the array B must - - contain the right-hand side matrix B, and on exit is - - overwritten by the solution matrix X. - - LDB - INTEGER. - On entry, LDB specifies the first dimension of B as declared - - in the calling (sub) program. LDB must be at least - - max( 1, m ). - Unchanged on exit. - - - Level 3 Blas routine. - - -- Written on 8-February-1989. - Jack Dongarra, Argonne National Laboratory. - Iain Duff, AERE Harwell. - Jeremy Du Croz, Numerical Algorithms Group Ltd. - Sven Hammarling, Numerical Algorithms Group Ltd. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] -#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] - - lside = lsame_(side, "L"); - if (lside) { - nrowa = *m; - } else { - nrowa = *n; - } - noconj = lsame_(transa, "T"); - nounit = lsame_(diag, "N"); - upper = lsame_(uplo, "U"); - - info = 0; - if (! lside && ! lsame_(side, "R")) { - info = 1; - } else if (! upper && ! lsame_(uplo, "L")) { - info = 2; - } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") - && ! lsame_(transa, "C")) { - info = 3; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 4; - } else if (*m < 0) { - info = 5; - } else if (*n < 0) { - info = 6; - } else if (*lda < max(1,nrowa)) { - info = 9; - } else if (*ldb < max(1,*m)) { - info = 11; - } - if (info != 0) { - xerbla_("ZTRSM ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - -/* And when alpha.eq.zero. */ - - if (alpha->r == 0. && alpha->i == 0.) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - B(i,j).r = 0., B(i,j).i = 0.; -/* L10: */ - } -/* L20: */ - } - return 0; - } - -/* Start the operations. */ - - if (lside) { - if (lsame_(transa, "N")) { - -/* Form B := alpha*inv( A )*B. */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (alpha->r != 1. || alpha->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j) - .i, z__1.i = alpha->r * B(i,j).i + - alpha->i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L30: */ - } - } - for (k = *m; k >= 1; --k) { - i__2 = k + j * b_dim1; - if (B(k,j).r != 0. || B(k,j).i != 0.) { - if (nounit) { - i__2 = k + j * b_dim1; - z_div(&z__1, &B(k,j), &A(k,k)); - B(k,j).r = z__1.r, B(k,j).i = z__1.i; - } - i__2 = k - 1; - for (i = 1; i <= k-1; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - i__5 = k + j * b_dim1; - i__6 = i + k * a_dim1; - z__2.r = B(k,j).r * A(i,k).r - B(k,j).i * - A(i,k).i, z__2.i = B(k,j).r * A(i,k).i + B(k,j).i * A(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L40: */ - } - } -/* L50: */ - } -/* L60: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (alpha->r != 1. || alpha->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j) - .i, z__1.i = alpha->r * B(i,j).i + - alpha->i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L70: */ - } - } - i__2 = *m; - for (k = 1; k <= *m; ++k) { - i__3 = k + j * b_dim1; - if (B(k,j).r != 0. || B(k,j).i != 0.) { - if (nounit) { - i__3 = k + j * b_dim1; - z_div(&z__1, &B(k,j), &A(k,k)); - B(k,j).r = z__1.r, B(k,j).i = z__1.i; - } - i__3 = *m; - for (i = k + 1; i <= *m; ++i) { - i__4 = i + j * b_dim1; - i__5 = i + j * b_dim1; - i__6 = k + j * b_dim1; - i__7 = i + k * a_dim1; - z__2.r = B(k,j).r * A(i,k).r - B(k,j).i * - A(i,k).i, z__2.i = B(k,j).r * A(i,k).i + B(k,j).i * A(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L80: */ - } - } -/* L90: */ - } -/* L100: */ - } - } - } else { - -/* Form B := alpha*inv( A' )*B - or B := alpha*inv( conjg( A' ) )*B. */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j).i, - z__1.i = alpha->r * B(i,j).i + alpha->i * B(i,j).r; - temp.r = z__1.r, temp.i = z__1.i; - if (noconj) { - i__3 = i - 1; - for (k = 1; k <= i-1; ++k) { - i__4 = k + i * a_dim1; - i__5 = k + j * b_dim1; - z__2.r = A(k,i).r * B(k,j).r - A(k,i).i * - B(k,j).i, z__2.i = A(k,i).r * B(k,j).i + A(k,i).i * B(k,j).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L110: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(i,i)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__3 = i - 1; - for (k = 1; k <= i-1; ++k) { - d_cnjg(&z__3, &A(k,i)); - i__4 = k + j * b_dim1; - z__2.r = z__3.r * B(k,j).r - z__3.i * B(k,j) - .i, z__2.i = z__3.r * B(k,j).i + - z__3.i * B(k,j).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L120: */ - } - if (nounit) { - d_cnjg(&z__2, &A(i,i)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__3 = i + j * b_dim1; - B(i,j).r = temp.r, B(i,j).i = temp.i; -/* L130: */ - } -/* L140: */ - } - } else { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - for (i = *m; i >= 1; --i) { - i__2 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j).i, - z__1.i = alpha->r * B(i,j).i + alpha->i * B(i,j).r; - temp.r = z__1.r, temp.i = z__1.i; - if (noconj) { - i__2 = *m; - for (k = i + 1; k <= *m; ++k) { - i__3 = k + i * a_dim1; - i__4 = k + j * b_dim1; - z__2.r = A(k,i).r * B(k,j).r - A(k,i).i * - B(k,j).i, z__2.i = A(k,i).r * B(k,j).i + A(k,i).i * B(k,j).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L150: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(i,i)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__2 = *m; - for (k = i + 1; k <= *m; ++k) { - d_cnjg(&z__3, &A(k,i)); - i__3 = k + j * b_dim1; - z__2.r = z__3.r * B(k,j).r - z__3.i * B(k,j) - .i, z__2.i = z__3.r * B(k,j).i + - z__3.i * B(k,j).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L160: */ - } - if (nounit) { - d_cnjg(&z__2, &A(i,i)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__2 = i + j * b_dim1; - B(i,j).r = temp.r, B(i,j).i = temp.i; -/* L170: */ - } -/* L180: */ - } - } - } - } else { - if (lsame_(transa, "N")) { - -/* Form B := alpha*B*inv( A ). */ - - if (upper) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - if (alpha->r != 1. || alpha->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j) - .i, z__1.i = alpha->r * B(i,j).i + - alpha->i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L190: */ - } - } - i__2 = j - 1; - for (k = 1; k <= j-1; ++k) { - i__3 = k + j * a_dim1; - if (A(k,j).r != 0. || A(k,j).i != 0.) { - i__3 = *m; - for (i = 1; i <= *m; ++i) { - i__4 = i + j * b_dim1; - i__5 = i + j * b_dim1; - i__6 = k + j * a_dim1; - i__7 = i + k * b_dim1; - z__2.r = A(k,j).r * B(i,k).r - A(k,j).i * - B(i,k).i, z__2.i = A(k,j).r * B(i,k).i + A(k,j).i * B(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L200: */ - } - } -/* L210: */ - } - if (nounit) { - z_div(&z__1, &c_b1, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - z__1.r = temp.r * B(i,j).r - temp.i * B(i,j).i, - z__1.i = temp.r * B(i,j).i + temp.i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L220: */ - } - } -/* L230: */ - } - } else { - for (j = *n; j >= 1; --j) { - if (alpha->r != 1. || alpha->i != 0.) { - i__1 = *m; - for (i = 1; i <= *m; ++i) { - i__2 = i + j * b_dim1; - i__3 = i + j * b_dim1; - z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j) - .i, z__1.i = alpha->r * B(i,j).i + - alpha->i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L240: */ - } - } - i__1 = *n; - for (k = j + 1; k <= *n; ++k) { - i__2 = k + j * a_dim1; - if (A(k,j).r != 0. || A(k,j).i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - i__5 = k + j * a_dim1; - i__6 = i + k * b_dim1; - z__2.r = A(k,j).r * B(i,k).r - A(k,j).i * - B(i,k).i, z__2.i = A(k,j).r * B(i,k).i + A(k,j).i * B(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L250: */ - } - } -/* L260: */ - } - if (nounit) { - z_div(&z__1, &c_b1, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - i__1 = *m; - for (i = 1; i <= *m; ++i) { - i__2 = i + j * b_dim1; - i__3 = i + j * b_dim1; - z__1.r = temp.r * B(i,j).r - temp.i * B(i,j).i, - z__1.i = temp.r * B(i,j).i + temp.i * B(i,j).r; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L270: */ - } - } -/* L280: */ - } - } - } else { - -/* Form B := alpha*B*inv( A' ) - or B := alpha*B*inv( conjg( A' ) ). */ - - if (upper) { - for (k = *n; k >= 1; --k) { - if (nounit) { - if (noconj) { - z_div(&z__1, &c_b1, &A(k,k)); - temp.r = z__1.r, temp.i = z__1.i; - } else { - d_cnjg(&z__2, &A(k,k)); - z_div(&z__1, &c_b1, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - i__1 = *m; - for (i = 1; i <= *m; ++i) { - i__2 = i + k * b_dim1; - i__3 = i + k * b_dim1; - z__1.r = temp.r * B(i,k).r - temp.i * B(i,k).i, - z__1.i = temp.r * B(i,k).i + temp.i * B(i,k).r; - B(i,k).r = z__1.r, B(i,k).i = z__1.i; -/* L290: */ - } - } - i__1 = k - 1; - for (j = 1; j <= k-1; ++j) { - i__2 = j + k * a_dim1; - if (A(j,k).r != 0. || A(j,k).i != 0.) { - if (noconj) { - i__2 = j + k * a_dim1; - temp.r = A(j,k).r, temp.i = A(j,k).i; - } else { - d_cnjg(&z__1, &A(j,k)); - temp.r = z__1.r, temp.i = z__1.i; - } - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + j * b_dim1; - i__4 = i + j * b_dim1; - i__5 = i + k * b_dim1; - z__2.r = temp.r * B(i,k).r - temp.i * B(i,k) - .i, z__2.i = temp.r * B(i,k).i + - temp.i * B(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L300: */ - } - } -/* L310: */ - } - if (alpha->r != 1. || alpha->i != 0.) { - i__1 = *m; - for (i = 1; i <= *m; ++i) { - i__2 = i + k * b_dim1; - i__3 = i + k * b_dim1; - z__1.r = alpha->r * B(i,k).r - alpha->i * B(i,k) - .i, z__1.i = alpha->r * B(i,k).i + - alpha->i * B(i,k).r; - B(i,k).r = z__1.r, B(i,k).i = z__1.i; -/* L320: */ - } - } -/* L330: */ - } - } else { - i__1 = *n; - for (k = 1; k <= *n; ++k) { - if (nounit) { - if (noconj) { - z_div(&z__1, &c_b1, &A(k,k)); - temp.r = z__1.r, temp.i = z__1.i; - } else { - d_cnjg(&z__2, &A(k,k)); - z_div(&z__1, &c_b1, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + k * b_dim1; - i__4 = i + k * b_dim1; - z__1.r = temp.r * B(i,k).r - temp.i * B(i,k).i, - z__1.i = temp.r * B(i,k).i + temp.i * B(i,k).r; - B(i,k).r = z__1.r, B(i,k).i = z__1.i; -/* L340: */ - } - } - i__2 = *n; - for (j = k + 1; j <= *n; ++j) { - i__3 = j + k * a_dim1; - if (A(j,k).r != 0. || A(j,k).i != 0.) { - if (noconj) { - i__3 = j + k * a_dim1; - temp.r = A(j,k).r, temp.i = A(j,k).i; - } else { - d_cnjg(&z__1, &A(j,k)); - temp.r = z__1.r, temp.i = z__1.i; - } - i__3 = *m; - for (i = 1; i <= *m; ++i) { - i__4 = i + j * b_dim1; - i__5 = i + j * b_dim1; - i__6 = i + k * b_dim1; - z__2.r = temp.r * B(i,k).r - temp.i * B(i,k) - .i, z__2.i = temp.r * B(i,k).i + - temp.i * B(i,k).r; - z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j) - .i - z__2.i; - B(i,j).r = z__1.r, B(i,j).i = z__1.i; -/* L350: */ - } - } -/* L360: */ - } - if (alpha->r != 1. || alpha->i != 0.) { - i__2 = *m; - for (i = 1; i <= *m; ++i) { - i__3 = i + k * b_dim1; - i__4 = i + k * b_dim1; - z__1.r = alpha->r * B(i,k).r - alpha->i * B(i,k) - .i, z__1.i = alpha->r * B(i,k).i + - alpha->i * B(i,k).r; - B(i,k).r = z__1.r, B(i,k).i = z__1.i; -/* L370: */ - } - } -/* L380: */ - } - } - } - } - - return 0; - -/* End of ZTRSM . */ - -} /* ztrsm_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,510 +0,0 @@ - -/* -- translated by f2c (version 19940927). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - -#include "f2c.h" - -/* Subroutine */ int ztrsv_(char *uplo, char *trans, char *diag, integer *n, - doublecomplex *a, integer *lda, doublecomplex *x, integer *incx) -{ - - - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - doublecomplex z__1, z__2, z__3; - - /* Builtin functions */ - void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg( - doublecomplex *, doublecomplex *); - - /* Local variables */ - static integer info; - static doublecomplex temp; - static integer i, j; - extern logical lsame_(char *, char *); - static integer ix, jx, kx; - extern /* Subroutine */ int xerbla_(char *, integer *); - static logical noconj, nounit; - - -/* Purpose - ======= - - ZTRSV solves one of the systems of equations - - A*x = b, or A'*x = b, or conjg( A' )*x = b, - - where b and x are n element vectors and A is an n by n unit, or - non-unit, upper or lower triangular matrix. - - No test for singularity or near-singularity is included in this - routine. Such tests must be performed before calling this routine. - - Parameters - ========== - - UPLO - CHARACTER*1. - On entry, UPLO specifies whether the matrix is an upper or - lower triangular matrix as follows: - - UPLO = 'U' or 'u' A is an upper triangular matrix. - - UPLO = 'L' or 'l' A is a lower triangular matrix. - - Unchanged on exit. - - TRANS - CHARACTER*1. - On entry, TRANS specifies the equations to be solved as - follows: - - TRANS = 'N' or 'n' A*x = b. - - TRANS = 'T' or 't' A'*x = b. - - TRANS = 'C' or 'c' conjg( A' )*x = b. - - Unchanged on exit. - - DIAG - CHARACTER*1. - On entry, DIAG specifies whether or not A is unit - triangular as follows: - - DIAG = 'U' or 'u' A is assumed to be unit triangular. - - DIAG = 'N' or 'n' A is not assumed to be unit - triangular. - - Unchanged on exit. - - N - INTEGER. - On entry, N specifies the order of the matrix A. - N must be at least zero. - Unchanged on exit. - - A - COMPLEX*16 array of DIMENSION ( LDA, n ). - Before entry with UPLO = 'U' or 'u', the leading n by n - upper triangular part of the array A must contain the upper - - triangular matrix and the strictly lower triangular part of - - A is not referenced. - Before entry with UPLO = 'L' or 'l', the leading n by n - lower triangular part of the array A must contain the lower - - triangular matrix and the strictly upper triangular part of - - A is not referenced. - Note that when DIAG = 'U' or 'u', the diagonal elements of - - A are not referenced either, but are assumed to be unity. - Unchanged on exit. - - LDA - INTEGER. - On entry, LDA specifies the first dimension of A as declared - - in the calling (sub) program. LDA must be at least - max( 1, n ). - Unchanged on exit. - - X - COMPLEX*16 array of dimension at least - ( 1 + ( n - 1 )*abs( INCX ) ). - Before entry, the incremented array X must contain the n - element right-hand side vector b. On exit, X is overwritten - - with the solution vector x. - - INCX - INTEGER. - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - Unchanged on exit. - - - Level 2 Blas routine. - - -- Written on 22-October-1986. - Jack Dongarra, Argonne National Lab. - Jeremy Du Croz, Nag Central Office. - Sven Hammarling, Nag Central Office. - Richard Hanson, Sandia National Labs. - - - - Test the input parameters. - - - Parameter adjustments - Function Body */ -#define X(I) x[(I)-1] - -#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] - - info = 0; - if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { - info = 1; - } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") && - ! lsame_(trans, "C")) { - info = 2; - } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) { - info = 3; - } else if (*n < 0) { - info = 4; - } else if (*lda < max(1,*n)) { - info = 6; - } else if (*incx == 0) { - info = 8; - } - if (info != 0) { - xerbla_("ZTRSV ", &info); - return 0; - } - -/* Quick return if possible. */ - - if (*n == 0) { - return 0; - } - - noconj = lsame_(trans, "T"); - nounit = lsame_(diag, "N"); - -/* Set up the start point in X if the increment is not unity. This - will be ( N - 1 )*INCX too small for descending loops. */ - - if (*incx <= 0) { - kx = 1 - (*n - 1) * *incx; - } else if (*incx != 1) { - kx = 1; - } - -/* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - - if (lsame_(trans, "N")) { - -/* Form x := inv( A )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - i__1 = j; - if (X(j).r != 0. || X(j).i != 0.) { - if (nounit) { - i__1 = j; - z_div(&z__1, &X(j), &A(j,j)); - X(j).r = z__1.r, X(j).i = z__1.i; - } - i__1 = j; - temp.r = X(j).r, temp.i = X(j).i; - for (i = j - 1; i >= 1; --i) { - i__1 = i; - i__2 = i; - i__3 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - - z__2.i; - X(i).r = z__1.r, X(i).i = z__1.i; -/* L10: */ - } - } -/* L20: */ - } - } else { - jx = kx + (*n - 1) * *incx; - for (j = *n; j >= 1; --j) { - i__1 = jx; - if (X(jx).r != 0. || X(jx).i != 0.) { - if (nounit) { - i__1 = jx; - z_div(&z__1, &X(jx), &A(j,j)); - X(jx).r = z__1.r, X(jx).i = z__1.i; - } - i__1 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - ix = jx; - for (i = j - 1; i >= 1; --i) { - ix -= *incx; - i__1 = ix; - i__2 = ix; - i__3 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - - z__2.i; - X(ix).r = z__1.r, X(ix).i = z__1.i; -/* L30: */ - } - } - jx -= *incx; -/* L40: */ - } - } - } else { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - if (X(j).r != 0. || X(j).i != 0.) { - if (nounit) { - i__2 = j; - z_div(&z__1, &X(j), &A(j,j)); - X(j).r = z__1.r, X(j).i = z__1.i; - } - i__2 = j; - temp.r = X(j).r, temp.i = X(j).i; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - i__3 = i; - i__4 = i; - i__5 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - - z__2.i; - X(i).r = z__1.r, X(i).i = z__1.i; -/* L50: */ - } - } -/* L60: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = jx; - if (X(jx).r != 0. || X(jx).i != 0.) { - if (nounit) { - i__2 = jx; - z_div(&z__1, &X(jx), &A(j,j)); - X(jx).r = z__1.r, X(jx).i = z__1.i; - } - i__2 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - ix = jx; - i__2 = *n; - for (i = j + 1; i <= *n; ++i) { - ix += *incx; - i__3 = ix; - i__4 = ix; - i__5 = i + j * a_dim1; - z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, - z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r; - z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - - z__2.i; - X(ix).r = z__1.r, X(ix).i = z__1.i; -/* L70: */ - } - } - jx += *incx; -/* L80: */ - } - } - } - } else { - -/* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. */ - - if (lsame_(uplo, "U")) { - if (*incx == 1) { - i__1 = *n; - for (j = 1; j <= *n; ++j) { - i__2 = j; - temp.r = X(j).r, temp.i = X(j).i; - if (noconj) { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = i; - z__2.r = A(i,j).r * X(i).r - A(i,j).i * X( - i).i, z__2.i = A(i,j).r * X(i).i + - A(i,j).i * X(i).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L90: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - d_cnjg(&z__3, &A(i,j)); - i__3 = i; - z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, - z__2.i = z__3.r * X(i).i + z__3.i * X( - i).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L100: */ - } - if (nounit) { - d_cnjg(&z__2, &A(j,j)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__2 = j; - X(j).r = temp.r, X(j).i = temp.i; -/* L110: */ - } - } else { - jx = kx; - i__1 = *n; - for (j = 1; j <= *n; ++j) { - ix = kx; - i__2 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - if (noconj) { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - i__3 = i + j * a_dim1; - i__4 = ix; - z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X( - ix).i, z__2.i = A(i,j).r * X(ix).i + - A(i,j).i * X(ix).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix += *incx; -/* L120: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__2 = j - 1; - for (i = 1; i <= j-1; ++i) { - d_cnjg(&z__3, &A(i,j)); - i__3 = ix; - z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, - z__2.i = z__3.r * X(ix).i + z__3.i * X( - ix).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix += *incx; -/* L130: */ - } - if (nounit) { - d_cnjg(&z__2, &A(j,j)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__2 = jx; - X(jx).r = temp.r, X(jx).i = temp.i; - jx += *incx; -/* L140: */ - } - } - } else { - if (*incx == 1) { - for (j = *n; j >= 1; --j) { - i__1 = j; - temp.r = X(j).r, temp.i = X(j).i; - if (noconj) { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - i__2 = i + j * a_dim1; - i__3 = i; - z__2.r = A(i,j).r * X(i).r - A(i,j).i * X( - i).i, z__2.i = A(i,j).r * X(i).i + - A(i,j).i * X(i).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L150: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - d_cnjg(&z__3, &A(i,j)); - i__2 = i; - z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, - z__2.i = z__3.r * X(i).i + z__3.i * X( - i).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; -/* L160: */ - } - if (nounit) { - d_cnjg(&z__2, &A(j,j)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__1 = j; - X(j).r = temp.r, X(j).i = temp.i; -/* L170: */ - } - } else { - kx += (*n - 1) * *incx; - jx = kx; - for (j = *n; j >= 1; --j) { - ix = kx; - i__1 = jx; - temp.r = X(jx).r, temp.i = X(jx).i; - if (noconj) { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - i__2 = i + j * a_dim1; - i__3 = ix; - z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X( - ix).i, z__2.i = A(i,j).r * X(ix).i + - A(i,j).i * X(ix).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix -= *incx; -/* L180: */ - } - if (nounit) { - z_div(&z__1, &temp, &A(j,j)); - temp.r = z__1.r, temp.i = z__1.i; - } - } else { - i__1 = j + 1; - for (i = *n; i >= j+1; --i) { - d_cnjg(&z__3, &A(i,j)); - i__2 = ix; - z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, - z__2.i = z__3.r * X(ix).i + z__3.i * X( - ix).r; - z__1.r = temp.r - z__2.r, z__1.i = temp.i - - z__2.i; - temp.r = z__1.r, temp.i = z__1.i; - ix -= *incx; -/* L190: */ - } - if (nounit) { - d_cnjg(&z__2, &A(j,j)); - z_div(&z__1, &temp, &z__2); - temp.r = z__1.r, temp.i = z__1.i; - } - } - i__1 = jx; - X(jx).r = temp.r, X(jx).i = temp.i; - jx -= *incx; -/* L200: */ - } - } - } - } - - return 0; - -/* End of ZTRSV . */ - -} /* ztrsv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/EXAMPLE/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/EXAMPLE/Makefile --- hypre-2.11.2/src/FEI_mv/DSuperLU/EXAMPLE/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/EXAMPLE/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ -####################################################################### -# -# This makefile creates the example programs for the linear equation -# routines in SuperLU_DIST. -# -# The command -# make -# without any arguments creates all the example programs. -# The command -# make double -# creates double precision real example programs. -# The command -# make complex16 -# creates double precision complex example programs. -# -# The executable files are called -# double real: pddrive pddrive_ABglobal pddrive1 -# pddrive1_ABglobal pddrive2 pddrive3 pddrive4 -# double complex: pzdrive pzdrive_ABglobal pzdrive1 -# pzdrive1_ABglobal pzdrive2 pzdrive3 pzdrive4 -# -# Alternatively, you can create example programs individually by -# typing the command (for example) -# make pddrive -# -# To remove the object files after the executable files have been -# created, enter -# make clean -# -####################################################################### -include ../make.inc -INCLUDEDIR = -I../SRC - -DEXM = pddrive.o dcreate_matrix.o sp_ienv.o dreadhb.o -# pdgstrs_lsum_X1.o pdgstrf_X1.o -DEXM1 = pddrive1.o dcreate_matrix.o dreadhb.o -DEXM2 = pddrive2.o dcreate_matrix.o dreadhb.o -DEXM3 = pddrive3.o dcreate_matrix.o dreadhb.o -DEXM4 = pddrive4.o dcreate_matrix.o dreadhb.o -DEXMG = pddrive_ABglobal.o dreadhb.o -DEXMG1 = pddrive1_ABglobal.o dreadhb.o sp_ienv.o -DEXMG2 = pddrive2_ABglobal.o dreadhb.o -DEXMG3 = pddrive3_ABglobal.o dreadhb.o -DEXMG4 = pddrive4_ABglobal.o dreadhb.o -ZEXM = pzdrive.o zcreate_matrix.o zreadhb.o #pzgstrf_X1.o -ZEXM1 = pzdrive1.o zcreate_matrix.o zreadhb.o -ZEXM2 = pzdrive2.o zcreate_matrix.o zreadhb.o -ZEXM3 = pzdrive3.o zcreate_matrix.o zreadhb.o -ZEXM4 = pzdrive4.o zcreate_matrix.o zreadhb.o -ZEXMG = pzdrive_ABglobal.o zreadhb.o -ZEXMG1 = pzdrive1_ABglobal.o zreadhb.o -ZEXMG2 = pzdrive2_ABglobal.o zreadhb.o -ZEXMG3 = pzdrive3_ABglobal.o zreadhb.o -ZEXMG4 = pzdrive4_ABglobal.o zreadhb.o - -all: double complex16 - -double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ - pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ - pddrive3_ABglobal pddrive4_ABglobal - -complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \ - pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \ - pzdrive3_ABglobal pzdrive4_ABglobal - -pddrive: $(DEXM) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@ - -pddrive1: $(DEXM1) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM1) $(LIBS) -lm -o $@ - -pddrive2: $(DEXM2) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM2) $(LIBS) -lm -o $@ - -pddrive3: $(DEXM3) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM3) $(LIBS) -lm -o $@ - -pddrive4: $(DEXM4) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@ - -pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@ - -pddrive1_ABglobal: $(DEXMG1) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXMG1) $(LIBS) -lm -o $@ - -pddrive2_ABglobal: $(DEXMG2) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXMG2) $(LIBS) -lm -o $@ - -pddrive3_ABglobal: $(DEXMG3) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXMG3) $(LIBS) -lm -o $@ - -pddrive4_ABglobal: $(DEXMG4) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXMG4) $(LIBS) -lm -o $@ - -pzdrive: $(ZEXM) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXM) $(LIBS) -lm -o $@ - -pzdrive1: $(ZEXM1) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXM1) $(LIBS) -lm -o $@ - -pzdrive2: $(ZEXM2) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXM2) $(LIBS) -lm -o $@ - -pzdrive3: $(ZEXM3) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXM3) $(LIBS) -lm -o $@ - -pzdrive4: $(ZEXM4) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@ - -pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@ - -pzdrive1_ABglobal: $(ZEXMG1) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXMG1) $(LIBS) -lm -o $@ - -pzdrive2_ABglobal: $(ZEXMG2) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXMG2) $(LIBS) -lm -o $@ - -pzdrive3_ABglobal: $(ZEXMG3) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXMG3) $(LIBS) -lm -o $@ - -pzdrive4_ABglobal: $(ZEXMG4) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(ZEXMG4) $(LIBS) -lm -o $@ - -.c.o: - $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) - -.f.o: - $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) - -clean: - rm -f *.o p[dz]drive p[dz]drive[1-9] \ - p[dz]drive_ABglobal p[dz]drive[1-9]_ABglobal - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamch.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,963 +0,0 @@ -#include -#define TRUE_ (1) -#define FALSE_ (0) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) - -double dlamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - Purpose - ======= - - DLAMCH determines double precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by DLAMCH: - = 'E' or 'e', DLAMCH := eps - = 'S' or 's , DLAMCH := sfmin - = 'B' or 'b', DLAMCH := base - = 'P' or 'p', DLAMCH := eps*base - = 'N' or 'n', DLAMCH := t - = 'R' or 'r', DLAMCH := rnd - = 'M' or 'm', DLAMCH := emin - = 'U' or 'u', DLAMCH := rmin - = 'L' or 'l', DLAMCH := emax - = 'O' or 'o', DLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ - - static int first = TRUE_; - - /* System generated locals */ - int i__1; - double ret_val; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static double base; - static int beta; - static double emin, prec, emax; - static int imin, imax; - static int lrnd; - static double rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static double small, sfmin; - extern /* Subroutine */ int dlamc2_(int *, int *, int *, - double *, int *, double *, int *, double *); - static int it; - static double rnd, eps; - - if (first) { - first = FALSE_; - dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (double) beta; - t = (double) it; - if (lrnd) { - rnd = 1.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1) / 2; - } else { - rnd = 0.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1); - } - prec = eps * base; - emin = (double) imin; - emax = (double) imax; - sfmin = rmin; - small = 1. / rmax; - if (small >= sfmin) { - - /* Use SMALL plus a bit, to avoid the possibility of rounding - causing overflow when computing 1/sfmin. */ - sfmin = small * (eps + 1.); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of DLAMCH */ - -} /* dlamch_ */ - - -/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - double d__1, d__2; - /* Local variables */ - static int lrnd; - static double a, b, c, f; - static int lbeta; - static double savec; - extern double dlamc3_(double *, double *); - static int lieee1; - static double t1, t2; - static int lt; - static double one, qtr; - - if (first) { - first = FALSE_; - one = 1.; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.; - c = dlamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = dlamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - d__1 = -a; - c = dlamc3_(&c, &d__1); - lbeta = (int) (c + qtr); - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (double) lbeta; - d__1 = b / 2; - d__2 = -b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - d__1 = b / 2; - d__2 = b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - d__1 = b / 2; - t1 = dlamc3_(&d__1, &a); - d__1 = b / 2; - t2 = dlamc3_(&d__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of DLAMC1 */ - -} /* dlamc1_ */ - - -/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, - double *eps, int *emin, double *rmin, int *emax, - double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) DOUBLE PRECISION - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) DOUBLE PRECISION - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) DOUBLE PRECISION - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - double d__1, d__2, d__3, d__4, d__5; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static int ieee; - static double half; - static int lrnd; - static double leps, zero, a, b, c; - static int i, lbeta; - static double rbase; - static int lemin, lemax, gnmin; - static double small; - static int gpmin; - static double third, lrmin, lrmax, sixth; - extern /* Subroutine */ int dlamc1_(int *, int *, int *, - int *); - extern double dlamc3_(double *, double *); - static int lieee1; - extern /* Subroutine */ int dlamc4_(int *, double *, int *), - dlamc5_(int *, int *, int *, int *, int *, - double *); - static int lt, ngnmin, ngpmin; - static double one, two; - - if (first) { - first = FALSE_; - zero = 0.; - one = 1.; - two = 2.; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - dlamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (double) lbeta; - i__1 = -lt; - a = pow_di(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - d__1 = -half; - sixth = dlamc3_(&b, &d__1); - third = dlamc3_(&sixth, &sixth); - d__1 = -half; - b = dlamc3_(&third, &d__1); - b = dlamc3_(&b, &sixth); - b = abs(b); - if (b < leps) { - b = leps; - } - - leps = 1.; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - d__1 = half * leps; -/* Computing 5th power */ - d__3 = two, d__4 = d__3, d__3 *= d__3; -/* Computing 2nd power */ - d__5 = leps; - d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); - c = dlamc3_(&d__1, &d__2); - d__1 = -c; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - d__1 = -b; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - d__1 = small * rbase; - small = dlamc3_(&d__1, &zero); -/* L20: */ - } - a = dlamc3_(&one, &small); - dlamc4_(&ngpmin, &one, &lbeta); - d__1 = -one; - dlamc4_(&ngnmin, &d__1, &lbeta); - dlamc4_(&gpmin, &a, &lbeta); - d__1 = -a; - dlamc4_(&gnmin, &d__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine DLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine DLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - d__1 = lrmin * rbase; - lrmin = dlamc3_(&d__1, &zero); -/* L30: */ - } - -/* Finally, call DLAMC5 to compute EMAX and RMAX. */ - - dlamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of DLAMC2 */ - -} /* dlamc2_ */ - - -double dlamc3_(double *a, double *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) DOUBLE PRECISION - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - double ret_val; - - ret_val = *a + *b; - - return ret_val; - -/* End of DLAMC3 */ - -} /* dlamc3_ */ - - -/* Subroutine */ int dlamc4_(int *emin, double *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC4 is a service routine for DLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) DOUBLE PRECISION - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static double zero, a; - static int i; - static double rbase, b1, b2, c1, c2, d1, d2; - extern double dlamc3_(double *, double *); - static double one; - - a = *start; - one = 1.; - rbase = one / *base; - zero = 0.; - *emin = 1; - d__1 = a * rbase; - b1 = dlamc3_(&d__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - d__1 = a / *base; - b1 = dlamc3_(&d__1, &zero); - d__1 = b1 * *base; - c1 = dlamc3_(&d__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - d__1 = a * rbase; - b2 = dlamc3_(&d__1, &zero); - d__1 = b2 / rbase; - c2 = dlamc3_(&d__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of DLAMC4 */ - -} /* dlamc4_ */ - - -/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A int flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) DOUBLE PRECISION - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static double c_b5 = 0.; - - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static int lexp; - static double oldy; - static int uexp, i; - static double y, z; - static int nbits; - extern double dlamc3_(double *, double *); - static double recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1. / *beta; - z = *beta - 1.; - y = 0.; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.) { - oldy = y; - } - y = dlamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - d__1 = y * *beta; - y = dlamc3_(&d__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of DLAMC5 */ - -} /* dlamc5_ */ - -double pow_di(double *ap, int *bp) -{ - double pow, x; - int n; - - pow = 1; - x = *ap; - n = *bp; - - if(n != 0){ - if(n < 0) { - n = -n; - x = 1/x; - } - for( ; ; ) { - if(n & 01) pow *= x; - if(n >>= 1) x *= x; - else break; - } - } - return(pow); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -#include - -main() -{ - /* Local variables */ - double base, emin, prec, emax, rmin, rmax, t, sfmin; - extern double dlamch_(char *); - double rnd, eps; - - eps = dlamch_("Epsilon"); - sfmin = dlamch_("Safe minimum"); - base = dlamch_("Base"); - prec = dlamch_("Precision"); - t = dlamch_("Number of digits in mantissa"); - rnd = dlamch_("Rounding mode"); - emin = dlamch_("Minnimum exponent"); - rmin = dlamch_("Underflow threshold"); - emax = dlamch_("Largest exponent"); - rmax = dlamch_("Overflow threshold"); - - printf(" Epsilon = %e\n", eps); - printf(" Safe minimum = %e\n", sfmin); - printf(" Base = %.0f\n", base); - printf(" Precision = %e\n", prec); - printf(" Number of digits in mantissa = %.0f\n", t); - printf(" Rounding mode = %.0f\n", rnd); - printf(" Minimum exponent = %.0f\n", emin); - printf(" Underflow threshold = %e\n", rmin); - printf(" Largest exponent = %.0f\n", emax); - printf(" Overflow threshold = %e\n", rmax); - printf(" Reciprocal of safe minimum = %e\n", 1./sfmin); - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/install.csh hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/install.csh --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/install.csh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/install.csh 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -#! /bin/csh - -set ofile = install.out # output file - -echo '---- SINGLE PRECISION' >! $ofile -./testslamch >> $ofile -echo '' >> $ofile -echo ---- DOUBLE PRECISION >> $ofile -./testdlamch >> $ofile -echo '' >> $ofile -echo ---- TIMER >> $ofile -./testtimer >> $ofile - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/lsame.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/lsame.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/lsame.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/lsame.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ -int lsame_(char *ca, char *cb) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - Purpose - ======= - - LSAME returns .TRUE. if CA is the same letter as CB regardless of case. - - Arguments - ========= - - CA (input) CHARACTER*1 - CB (input) CHARACTER*1 - CA and CB specify the single characters to be compared. - - ===================================================================== -*/ - - /* System generated locals */ - int ret_val; - - /* Local variables */ - int inta, intb, zcode; - - ret_val = *(unsigned char *)ca == *(unsigned char *)cb; - if (ret_val) { - return ret_val; - } - - /* Now test for equivalence if both characters are alphabetic. */ - - zcode = 'Z'; - - /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime - machines, on which ICHAR returns a value with bit 8 set. - ICHAR('A') on Prime machines returns 193 which is the same as - ICHAR('A') on an EBCDIC machine. */ - - inta = *(unsigned char *)ca; - intb = *(unsigned char *)cb; - - if (zcode == 90 || zcode == 122) { - /* ASCII is assumed - ZCODE is the ASCII code of either lower or - upper case 'Z'. */ - if (inta >= 97 && inta <= 122) inta += -32; - if (intb >= 97 && intb <= 122) intb += -32; - - } else if (zcode == 233 || zcode == 169) { - /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or - upper case 'Z'. */ - if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta - >= 162 && inta <= 169) - inta += 64; - if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb - >= 162 && intb <= 169) - intb += 64; - } else if (zcode == 218 || zcode == 250) { - /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code - plus 128 of either lower or upper case 'Z'. */ - if (inta >= 225 && inta <= 250) inta += -32; - if (intb >= 225 && intb <= 250) intb += -32; - } - ret_val = inta == intb; - return ret_val; - -} /* lsame_ */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/Makefile --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ -include ../make.inc - -all: testdlamch testslamch testtimer #install.out - -testdlamch: dlamch.o lsame.o dlamchtst.o - $(LOADER) $(LOADOPTS) -o testdlamch dlamch.o lsame.o dlamchtst.o - -testslamch: slamch.o lsame.o slamchtst.o - $(LOADER) $(LOADOPTS) -o testslamch slamch.o lsame.o slamchtst.o - -testtimer: superlu_timer.o timertst.o - $(LOADER) $(LOADOPTS) -o testtimer superlu_timer.o timertst.o \ - $(MPILIB) - -install.out: - @echo Testing machines parameters and timer - csh install.csh - -slamch.o: slamch.c ; $(CC) $(NOOPTS) -c $< -dlamch.o: dlamch.c ; $(CC) $(NOOPTS) -c $< -superlu_timer.o: superlu_timer.c; $(CC) $(NOOPTS) -c $< - -.c.o: - $(CC) $(CFLAGS) -c $< - -clean: - rm -f *.o test* *.out diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamch.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,982 +0,0 @@ -#include -#define TRUE_ (1) -#define FALSE_ (0) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define dabs(x) (double)abs(x) - -double slamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMCH determines single precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by SLAMCH: - = 'E' or 'e', SLAMCH := eps - = 'S' or 's , SLAMCH := sfmin - = 'B' or 'b', SLAMCH := base - = 'P' or 'p', SLAMCH := eps*base - = 'N' or 'n', SLAMCH := t - = 'R' or 'r', SLAMCH := rnd - = 'M' or 'm', SLAMCH := emin - = 'U' or 'u', SLAMCH := rmin - = 'L' or 'l', SLAMCH := emax - = 'O' or 'o', SLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ -/* >>Start of File<< - Initialized data */ - static int first = TRUE_; - /* System generated locals */ - int i__1; - float ret_val; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static float base; - static int beta; - static float emin, prec, emax; - static int imin, imax; - static int lrnd; - static float rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static float small, sfmin; - extern /* Subroutine */ int slamc2_(int *, int *, int *, float - *, int *, float *, int *, float *); - static int it; - static float rnd, eps; - - - - if (first) { - first = FALSE_; - slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (float) beta; - t = (float) it; - if (lrnd) { - rnd = 1.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1) / 2; - } else { - rnd = 0.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1); - } - prec = eps * base; - emin = (float) imin; - emax = (float) imax; - sfmin = rmin; - small = 1.f / rmax; - if (small >= sfmin) { - -/* Use SMALL plus a bit, to avoid the possibility of rou -nding - causing overflow when computing 1/sfmin. */ - - sfmin = small * (eps + 1.f); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of SLAMCH */ - -} /* slamch_ */ - - -/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - float r__1, r__2; - /* Local variables */ - static int lrnd; - static float a, b, c, f; - static int lbeta; - static float savec; - static int lieee1; - static float t1, t2; - extern double slamc3_(float *, float *); - static int lt; - static float one, qtr; - - - - if (first) { - first = FALSE_; - one = 1.f; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.f; - c = slamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = slamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - lbeta = c + qtr; - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (float) lbeta; - r__1 = b / 2; - r__2 = -(double)b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - r__1 = b / 2; - r__2 = b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - r__1 = b / 2; - t1 = slamc3_(&r__1, &a); - r__1 = b / 2; - t2 = slamc3_(&r__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of SLAMC1 */ - -} /* slamc1_ */ - - -/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float * - eps, int *emin, float *rmin, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) FLOAT - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) FLOAT - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) FLOAT - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - float r__1, r__2, r__3, r__4, r__5; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static int ieee; - static float half; - static int lrnd; - static float leps, zero, a, b, c; - static int i, lbeta; - static float rbase; - static int lemin, lemax, gnmin; - static float small; - static int gpmin; - static float third, lrmin, lrmax, sixth; - static int lieee1; - extern /* Subroutine */ int slamc1_(int *, int *, int *, - int *); - extern double slamc3_(float *, float *); - extern /* Subroutine */ int slamc4_(int *, float *, int *), - slamc5_(int *, int *, int *, int *, int *, - float *); - static int lt, ngnmin, ngpmin; - static float one, two; - - - - if (first) { - first = FALSE_; - zero = 0.f; - one = 1.f; - two = 2.f; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - slamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (float) lbeta; - i__1 = -lt; - a = pow_ri(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - r__1 = -(double)half; - sixth = slamc3_(&b, &r__1); - third = slamc3_(&sixth, &sixth); - r__1 = -(double)half; - b = slamc3_(&third, &r__1); - b = slamc3_(&b, &sixth); - b = dabs(b); - if (b < leps) { - b = leps; - } - - leps = 1.f; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - r__1 = half * leps; -/* Computing 5th power */ - r__3 = two, r__4 = r__3, r__3 *= r__3; -/* Computing 2nd power */ - r__5 = leps; - r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5); - c = slamc3_(&r__1, &r__2); - r__1 = -(double)c; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - r__1 = -(double)b; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - r__1 = small * rbase; - small = slamc3_(&r__1, &zero); -/* L20: */ - } - a = slamc3_(&one, &small); - slamc4_(&ngpmin, &one, &lbeta); - r__1 = -(double)one; - slamc4_(&ngnmin, &r__1, &lbeta); - slamc4_(&gpmin, &a, &lbeta); - r__1 = -(double)a; - slamc4_(&gnmin, &r__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine SLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine SLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.f; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - r__1 = lrmin * rbase; - lrmin = slamc3_(&r__1, &zero); -/* L30: */ - } - -/* Finally, call SLAMC5 to compute EMAX and RMAX. */ - - slamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of SLAMC2 */ - -} /* slamc2_ */ - - -double slamc3_(float *a, float *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) FLOAT - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - float ret_val; - - - - ret_val = *a + *b; - - return ret_val; - -/* End of SLAMC3 */ - -} /* slamc3_ */ - - -/* Subroutine */ int slamc4_(int *emin, float *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC4 is a service routine for SLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) FLOAT - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static float zero, a; - static int i; - static float rbase, b1, b2, c1, c2, d1, d2; - extern double slamc3_(float *, float *); - static float one; - - - - a = *start; - one = 1.f; - rbase = one / *base; - zero = 0.f; - *emin = 1; - r__1 = a * rbase; - b1 = slamc3_(&r__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - r__1 = a / *base; - b1 = slamc3_(&r__1, &zero); - r__1 = b1 * *base; - c1 = slamc3_(&r__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - r__1 = a * rbase; - b2 = slamc3_(&r__1, &zero); - r__1 = b2 / rbase; - c2 = slamc3_(&r__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of SLAMC4 */ - -} /* slamc4_ */ - - -/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A logical flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) FLOAT - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static float c_b5 = 0.f; - - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static int lexp; - static float oldy; - static int uexp, i; - static float y, z; - static int nbits; - extern double slamc3_(float *, float *); - static float recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1.f / *beta; - z = *beta - 1.f; - y = 0.f; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.f) { - oldy = y; - } - y = slamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.f) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - r__1 = y * *beta; - y = slamc3_(&r__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of SLAMC5 */ - -} /* slamc5_ */ - - -double pow_ri(float *ap, int *bp) -{ -double pow, x; -int n; - -pow = 1; -x = *ap; -n = *bp; - -if(n != 0) - { - if(n < 0) - { - n = -n; - x = 1/x; - } - for( ; ; ) - { - if(n & 01) - pow *= x; - if(n >>= 1) - x *= x; - else - break; - } - } -return(pow); -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -#include - -main() -{ - /* Local variables */ - float base, emin, prec, emax, rmin, rmax, t, sfmin; - extern double slamch_(char *); - float rnd, eps; - - eps = slamch_("Epsilon"); - sfmin = slamch_("Safe minimum"); - base = slamch_("Base"); - prec = slamch_("Precision"); - t = slamch_("Number of digits in mantissa"); - rnd = slamch_("Rounding mode"); - emin = slamch_("Minnimum exponent"); - rmin = slamch_("Underflow threshold"); - emax = slamch_("Largest exponent"); - rmax = slamch_("Overflow threshold"); - - printf(" Epsilon = %e\n", eps); - printf(" Safe minimum = %e\n", sfmin); - printf(" Base = %.0f\n", base); - printf(" Precision = %e\n", prec); - printf(" Number of digits in mantissa = %.0f\n", t); - printf(" Rounding mode = %.0f\n", rnd); - printf(" Minimum exponent = %.0f\n", emin); - printf(" Underflow threshold = %e\n", rmin); - printf(" Largest exponent = %.0f\n", emax); - printf(" Overflow threshold = %e\n", rmax); - printf(" Reciprocal of safe minimum = %e\n", 1./sfmin); - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -/* - * Purpose - * ======= - * Returns the time in seconds used by the process. - * - * Note: the timer function call is machine dependent. Use conditional - * compilation to choose the appropriate function. - * - */ - - -#ifdef SUN -/* - * It uses the system call gethrtime(3C), which is accurate to - * nanoseconds. -*/ -#include - -double SuperLU_timer_() { - return ( (double)gethrtime() / 1e9 ); -} - -#elif defined ( UNIX_TIMER ) - -#include -#include -#include -#include - -#ifndef CLK_TCK -#define CLK_TCK 60 -#endif - -double SuperLU_timer_() -{ - struct tms use; - double tmp; - times(&use); - tmp = use.tms_utime; - tmp += use.tms_stime; - return (double)(tmp) / (double) CLK_TCK; -} - -#else - -#include - -double SuperLU_timer_() -{ - return MPI_Wtime(); -} - -#endif - Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testdlamch and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testdlamch differ Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testslamch and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testslamch differ Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testtimer and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testtimer differ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/timertst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/timertst.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/timertst.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/timertst.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -#include -#include - -void mysub(int n, double *x, double *y) -{ - return; -} - -main(int argc, char *argv[]) -{ - /* Parameters */ -#define NMAX 100 -#define ITS 10000 - - int i, j; - double alpha, avg, t1, t2, tnotim; - double x[NMAX], y[NMAX]; - double SuperLU_timer_(); - - MPI_Init( &argc, &argv ); - - /* Initialize X and Y */ - for (i = 0; i < NMAX; ++i) { - x[i] = 1.0 / (double)(i+1); - y[i] = (double)(NMAX - i) / (double)NMAX; - } - alpha = 0.315; - - /* Time 1,000,000 DAXPY operations */ - t1 = SuperLU_timer_(); - for (j = 0; j < ITS; ++j) { - for (i = 0; i < NMAX; ++i) - y[i] += alpha * x[i]; - alpha = -alpha; - } - t2 = SuperLU_timer_(); - printf("Time for 1,000,000 DAXPY ops = %10.3g seconds\n", t2-t1); - if ( t2-t1 > 0. ) - printf("DAXPY performance rate = %10.3g mflops\n", 2./(t2-t1)); - else - printf("*** Error: Time for operations was zero\n"); - - tnotim = t2 - t1; - - /* Time 1,000,000 DAXPY operations with SuperLU_timer_() - in the outer loop */ - t1 = SuperLU_timer_(); - for (j = 0; j < ITS; ++j) { - for (i = 0; i < NMAX; ++i) - y[i] += alpha * x[i]; - alpha = -alpha; - t2 = SuperLU_timer_(); - } - - /* Compute the time in milliseconds used by an average call to - SuperLU_timer_(). */ - printf("Including DSECND, time = %10.3g seconds\n", t2-t1); - avg = ( (t2 - t1) - tnotim )*1000. / (double)ITS; - printf("Average time for DSECND = %10.3g milliseconds\n", avg); - - /* Compute the equivalent number of floating point operations used - by an average call to DSECND. */ - if ( tnotim > 0. ) - printf("Equivalent floating point ops = %10.3g ops\n", - 1000.*avg / tnotim); - - mysub(NMAX, x, y); - - MPI_Finalize(); - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile --- hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: Makefile -# -# Purpose: Top-level Makefile -# -# Creation date: September 1, 1999 version 1.0 -# -# Modified: -# -############################################################################ - -############################ -# for hypre -############################ -all: - ( cd SRC; $(MAKE) all) - -install: - ( cd SRC; $(MAKE) install) - -clean: - ( cd SRC; $(MAKE) clean) - -distclean: clean - -############################ - -#include make.inc - -#all: install lib example - -#lib: superlulib - -example: - ( cd EXAMPLE; $(MAKE) ) - -#clean: cleanlib cleantesting - -#install: - ( cd INSTALL; $(MAKE) ) -# ( cd INSTALL; cp lsame.c ../SRC/; \ -# cp dlamch.c ../SRC/; cp slamch.c ../SRC/ ) - -blaslib: - ( cd CBLAS; $(MAKE) ) - -superlulib: - ( cd SRC; $(MAKE) ) - -cleanlib: - ( cd SRC; $(MAKE) clean ) - ( cd CBLAS; $(MAKE) clean ) - ( cd lib; rm -f *.a ) - -cleantesting: - ( cd INSTALL; $(MAKE) clean ) - ( cd EXAMPLE; $(MAKE) clean ) - ( cd FORTRAN; $(MAKE) clean ) diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile.dsuperlu hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile.dsuperlu --- hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile.dsuperlu 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile.dsuperlu 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: Makefile -# -# Purpose: Top-level Makefile -# -# Creation date: September 1, 1999 version 1.0 -# -# Modified: -# -############################################################################ - -include make.inc - -all: install lib example - -lib: superlulib - -example: - ( cd EXAMPLE; $(MAKE) ) - -clean: cleanlib cleantesting - -install: - ( cd INSTALL; $(MAKE) ) -# ( cd INSTALL; cp lsame.c ../SRC/; \ -# cp dlamch.c ../SRC/; cp slamch.c ../SRC/ ) - -blaslib: - ( cd CBLAS; $(MAKE) ) - -superlulib: - ( cd SRC; $(MAKE) ) - -cleanlib: - ( cd SRC; $(MAKE) clean ) - ( cd CBLAS; $(MAKE) clean ) - ( cd lib; rm -f *.a ) - -cleantesting: - ( cd INSTALL; $(MAKE) clean ) - ( cd EXAMPLE; $(MAKE) clean ) - ( cd FORTRAN; $(MAKE) clean ) diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/make.inc hypre-2.13.0/src/FEI_mv/DSuperLU/make.inc --- hypre-2.11.2/src/FEI_mv/DSuperLU/make.inc 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/make.inc 1970-01-01 00:00:00.000000000 +0000 @@ -1,65 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# -# January 18, 2006 Sam Adams -# General Dynamics - Network Systems -# works for i386 Linux, with LAM-MPI 7.1.1 and GCC 4. -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _i386 - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = /home/chtong/HYPRE/linear_solvers/FEI_mv/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.0.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = /usr/lib/libblas.so.3 -METISLIB = -PARMETISLIB = -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -CC = mpicc -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -pipe -O2 -# -# NOOPTS should be set to be the C flags that turn off any optimization -NOOPTS = -############################################################################ -# FORTRAN compiler setup -FORTRAN = mpif77 -FFLAGS = -############################################################################ -#LOADER = mpif77 -LOADER = mpicc -LOADOPTS = -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd__ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.altix hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.altix --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.altix 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.altix 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: April 10, 2006 version 2.0 -# -# Modified: November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _altix - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/SuperLU_codes/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu$(PLAT).a - -MKLHOME = /usr/common/intel/mkl/8.1.014 -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -L${MKLHOME}/lib/64 -lmkl_ipf -lguide -PARMETISLIB = -L$(HOME)/Codes/ParMetis-3.1 -lparmetis -METISLIB = -lmetis -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \ - -lmpi -lm -L/usr/common/intel/fc/8.1.029/lib -lifcore -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = crv -RANLIB = ranlib - -####################################################################### -# C compiler setup -CC = icc -ISA = -ftz -mp -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = $(ISA) -O3 -DDEBUGlevel=0 -DPRNTlevel=0 -# -# NOOPTS should be set to be the C flags that turn off any optimization -NOOPTS = $(ISA) -O0 -############################################################################ -# FORTRAN compiler setup -FORTRAN = ifort -FFLAGS = $(CFLAGS) -F90FLAGS = -r8 -check all -save -Dmpi -ftz -############################################################################ -LOADER = icc -LOADOPTS = $(CFLAGS) -# -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd_ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# -# January 18, 2006 Sam Adams -# General Dynamics - Network Systems -# works for i386 Linux, with LAM-MPI 7.1.1 and GCC 4. -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _i386 - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = /opt/SuperLU_DIST_2.0 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.0.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = /usr/lib/libblas.so.3 -METISLIB = -PARMETISLIB = -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -CC = mpicc -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -pipe -O2 -# -# NOOPTS should be set to be the C flags that turn off any optimization -NOOPTS = -############################################################################ -# FORTRAN compiler setup -FORTRAN = mpif77 -FFLAGS = -############################################################################ -LOADER = mpif77 -LOADOPTS = -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd__ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _jacquard - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -L/usr/common/usg/acml/2.6.0/pathscale64/lib -lacml -lacml_mv -METISLIB = -PARMETISLIB = -#MPILIB = -L/usr/lpp/ppe.poe/lib -lmpi_r -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -CC = mpicc -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -O3 -DDEBUGlevel=0 -DPRNTlevel=1 -# -# NOOPTS should be set to be the C flags that turn off any optimization -NOOPTS = -O0 -############################################################################ -# FORTRAN compiler setup -FORTRAN = mpif90 -FFLAGS = -O3 -############################################################################ -LOADER = mpif90 - -# 32-bit: -LOADOPTS = -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd_ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.origin hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.origin --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.origin 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.origin 1970-01-01 00:00:00.000000000 +0000 @@ -1,66 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1997 version 1.0 -# -# Modified: November 11, 2002 (by Tom Oppe) -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _sgi - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -lscs -MPILIB = -lmpi -METISLIB = -PARMETISLIB = -# -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \ - $(MPILIB) -lfortran - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = crv -RANLIB = touch - -####################################################################### -# C compiler setup -CC = cc -ISA = -64 -mips4 -TARG:platform=ip35 -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = $(ISA) -O2 -# -# NOOPTS should be set to be the C flags that turn off any optimization -# This must be enforced to compile the two routines: slamch.c and dlamch.c. -NOOPTS = $(ISA) -O0 -############################################################################ -# FORTRAN compiler setup -FORTRAN = f90 -FFLAGS = $(CFLAGS) -############################################################################ -LOADER = cc -LOADOPTS = $(CFLAGS) -# -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd_ -DORIGIN diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp 1970-01-01 00:00:00.000000000 +0000 @@ -1,66 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _sp - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -lessl -#MPILIB = -L/usr/lpp/ppe.poe/lib -lmpi -#PERFLIB = -L/vol1/VAMPIR/lib -lVT -METISLIB = -PARMETISLIB = -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -CC = mpcc -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -D_SP -O3 -qarch=PWR3 -qalias=allptrs \ - -DDEBUGlevel=0 -DPRNTlevel=0 -# -# NOOPTS should be set to be the C flags that turn off any optimization -# This must be enforced to compile the two routines: slamch.c and dlamch.c. -NOOPTS = -############################################################################ -FORTRAN = mpxlf90 -FFLAGS = -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr3 -############################################################################ -LOADER = mpxlf90 -#LOADOPTS = -bmaxdata:0x80000000 -LOADOPTS = -bmaxdata:0x70000000 -# -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DNoChange - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _power5 - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/Release_Codes/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -lessl -METISLIB = -L/usr/common/usg/parmetis/3.1 -lmetis -PARMETISLIB = -L/usr/common/usg/parmetis/3.1 -lparmetis -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -# 64-bit: -ARCHFLAGS = -X64 cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -# 64-bit -CC = mpcc_r -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -D_SP -qarch=pwr5 -qalias=allptrs -q64 \ - -DDEBUGlevel=0 -DPRNTlevel=0 -O3 -# -# NOOPTS should be set to be the C flags that turn off any optimization -# 64-bit -NOOPTS = -q64 - -############################################################################ -# FORTRAN compiler setup -# 64-bit -FORTRAN = mpxlf90_r -FFLAGS = -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr5\ - -q64 #-qintsize=8 -############################################################################ -# 64-bit -LOADER = mpxlf90_r - -# 64-bit: -LOADOPTS = -q64 -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DNoChange diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e 1970-01-01 00:00:00.000000000 +0000 @@ -1,68 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1997 version alpha -# -# Modified: September 1, 1999 version 1.0 -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _t3e - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -# -#PERFLIB = -l pat pat.cld -#PERFLIB = -lapp -METISLIB = -PARMETISLIB = -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -CC = cc -# CFLAGS should be set to be the C flags that include optimization -#CFLAGS = -D_CRAY -DPRNTlevel=1 -O3 -h aggress,split,unroll -CFLAGS = -O3 -D_CRAY -DPRNTlevel=0 -DDEBUGlevel=0 -DPROFlevel=0 -# -happrentice,inline0 -PTROPT = -h restrict=a -# -# NOOPTS should be set to be the C flags that turn off any optimization -# This must be enforced to compile the two routines: slamch.c and dlamch.c. -NOOPTS = -############################################################################ -# FORTRAN compiler setup -FORTRAN = f90 -FFLAGS = -O3 -dp -i 32 -############################################################################ -LOADER = cc -LOADOPTS = -# -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DUpCase - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4 hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4 --- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4 1970-01-01 00:00:00.000000000 +0000 @@ -1,62 +0,0 @@ -############################################################################ -# -# Program: SuperLU_DIST -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: February 4, 1999 version alpha -# -# Modified: September 1, 1999 version 1.0 -# March 15, 2003 version 2.0 -# November 1, 2007 version 2.1 -# -############################################################################ -# -# The machine (platform) identifier to append to the library names -# -PLAT = _xt4 - -# -# The name of the libraries to be created/linked to -# -DSuperLUroot = $(HOME)/Release_Codes/SuperLU_DIST_2.2 -DSUPERLULIB = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a -# -BLASDEF = -DUSE_VENDOR_BLAS -BLASLIB = -METISLIB = -L/usr/common/usg/parmetis/3.1 -lmetis -PARMETISLIB = -L/usr/common/usg/parmetis/3.1 -lparmetis -FLIBS = -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl -LIBS = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS) - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -############################################################################ -# C compiler setup -CC = cc -# CFLAGS should be set to be the C flags that include optimization -CFLAGS = -fastsse -DDEBUGlevel=0 -DPRNTlevel=1 -# -# NOOPTS should be set to be the C flags that turn off any optimization -NOOPTS = -O0 -############################################################################ -# FORTRAN compiler setup -FORTRAN = ftn -FFLAGS = -fastsse -############################################################################ -LOADER = cc -LOADOPTS = -############################################################################ -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -# Need follow the convention of how C calls a Fortran routine. -# -CDEFS = -DAdd_ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/README hypre-2.13.0/src/FEI_mv/DSuperLU/README --- hypre-2.11.2/src/FEI_mv/DSuperLU/README 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,175 +0,0 @@ - SuperLU_DIST (version 2.1) - ========================== - -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -(1) Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. -(2) Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. -(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of -Energy nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -SuperLU_DIST contains a set of subroutines to solve a sparse linear system -A*X=B. It uses Gaussian elimination with static pivoting (GESP). -Static pivoting is a technique that combines the numerical stability of -partial pivoting with the scalability of Cholesky (no pivoting), -to run accurately and efficiently on large numbers of processors. - -SuperLU_DIST is a parallel extension to the serial SuperLU library. -It is targeted for the distributed memory parallel machines. -SuperLU_DIST is implemented in ANSI C, and MPI for communications. -Currently, the LU factorization and triangular solution routines, -which are the most time-consuming part of the solution process, -are parallelized. The other routines, such as static pivoting and -column preordering for sparsity are performed sequentially. -This "alpha" release contains double-precision real and double-precision -complex data types. - -The distribution contains the following directory structure: - - SuperLU_DIST/README instructions on installation - SuperLU_DIST/CBLAS/ needed BLAS routines in C, not necessarily fast - SuperLU_DIST/DOC/ the Users' Guide - SuperLU_DIST/EXAMPLE/ example programs - SuperLU_DIST/INSTALL/ test machine dependent parameters - SuperLU_DIST/SRC/ C source code, to be compiled into libsuperlu_dist.a - SuperLU_DIST/lib/ contains library archive libsuperlu_dist.a - SuperLU_DIST/Makefile top level Makefile that does installation and testing - SuperLU_DIST/make.inc compiler, compiler flags, library definitions and C - preprocessor definitions, included in all Makefiles. - (You may need to edit it to suit for your system - before compiling the whole package.) - SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files - -Before installing the package, please examine the three things dependent -on your system setup: - -1. Edit the make.inc include file. - - This make include file is referenced inside each of the Makefiles - in the various subdirectories. As a result, there is no need to - edit the Makefiles in the subdirectories. All information that is - machine specific has been defined in this include file. - - Sample machine-specific {\tt make.inc} are provided in the MAKE_INC/ - directory for several platforms, such as Cray T3E and IBM SP. - When you have selected the machine to which you wish to install - SuperLU_DIST, copy the appropriate sample include file (if one is present) - into make.inc. For example, if you wish to run SuperLU_DIST on a - Cray T3E, you can do - - cp MAKE_INC/make.t3e make.inc - - For the systems other than listed above, some porting effort is needed - for parallel factorization routines. Please refer to the Users' Guide - for detailed instructions on porting. - - -2. The BLAS library. - - The parallel routines in SuperLU_DIST uses some sequential BLAS routines - on each process. If there is BLAS library available on your machine, - you may define the following in the file make.inc: - BLASDEF = -DUSE_VENDOR_BLAS - BLASLIB = - - The CBLAS/ subdirectory contains the part of the C BLAS needed by - SuperLU_DIST package. However, these codes are intended for use only if - there is no faster implementation of the BLAS already available on your - machine. In this case, you should go to the top-level SuperLU_DIST/ - directory and do the following: - - 1) In make.inc, undefine (comment out) BLASDEF, and define: - BLASLIB = ../lib/libblas$(PLAT).a - - 2) Type: - make blaslib - to make the BLAS library from the routines in the CBLAS/ subdirectory. - - -3. External libraries: Metis and ParMetis. - - If you will use Metis or ParMetis ordering, or parallel - symbolic factorization (which depends on ParMetis), you will - need to install them yourself. Since ParMetis package already - contains the source code for the Metis library, you can just - download ParMetis at: - http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download - - After you have installed it, you should define the following in make.inc: - METISLIB = -L -lmetis - PARMETISLIB = -L -lparmetis - - -4. C preprocessor definition CDEFS. - - In the header file SRC/Cnames.h, we use macros to determine how - C routines should be named so that they are callable by Fortran. - (Some vendor-supplied BLAS libraries do not have C interfaces. So the - re-naming is needed in order for the SuperLU BLAS calls (in C) to - interface with the Fortran-style BLAS.) - The possible options for CDEFS are: - - o -DAdd_: Fortran expects a C routine to have an underscore - postfixed to the name; - o -DNoChange: Fortran expects a C routine name to be identical to - that compiled by C; - o -DUpCase: Fortran expects a C routine name to be all uppercase. - -A Makefile is provided in each subdirectory. The installation can be done -completely automatically by simply typing "make" at the top level. - - -REFERENCES - -[1] SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for - Unsymmetric Linear Systems. Xiaoye S. Li and James W. Demmel. - ACM Trans. on Math. Solftware, Vol. 29, No. 2, June 2003, pp. 110-140. -[2] Parallel Symbolic Factorization for Sparse LU with Static Pivoting. - L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3, - 1289-1314, 2007. - -Xiaoye S. Li Lawrence Berkeley National Lab, xsli@lbl.gov -James Demmel Univ. of California Berkeley, demmel@cs.berkeley.edu -Laura Grigori INRIA, France, Laura.Grigori@inria.fr - - ------------------ -| RELEASE NOTES | ------------------ - -* Version 2.0, 10-15-2003 - - "set_default_options" is renamed to "set_default_options_dist", - because sequential SuperLU now uses set_default_options. - - add a field in input "options" argument to control whether to - print the solver's statistics. - - add Fortran 90 wrapper in FORTRAN/ directory. - - fixed a bug in triangular solve, which is related to the unfreed - MPI_Irecv request handles. - -* Version 2.1, 10-01-2007 - - include parallel symbolic factorization capability. - - include Metis or ParMetis ordering algorithms. diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,348 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. - */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ DASUM -#define idamax_ IDAMAX -#define dcopy_ DCOPY -#define dscal_ DSCAL -#define dger_ DGER -#define dnrm2_ DNRM2 -#define dsymv_ DSYMV -#define ddot_ DDOT -#define daxpy_ DAXPY -#define dsyr2_ DSYR2 -#define drot_ DROT -#define dgemv_ DGEMV -#define dtrsv_ DTRSV -#define dgemm_ DGEMM -#define dtrsm_ DTRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ DZASUM -#define izamax_ IZAMAX -#define zcopy_ ZCOPY -#define zscal_ ZSCAL -#define dznrm2_ DZNRM2 -#define zaxpy_ ZAXPY -#define zgemv_ ZGEMV -#define ztrsv_ ZTRSV -#define zgemm_ ZGEMM -#define ztrsm_ ZTRSM -#define zgerc_ ZGERC -#define zhemv_ ZHEMV -#define zher2_ ZHER2 -#define zgeru_ ZGERU - -/* LAPACK */ -#define dlamch_ DLAMCH -#define slamch_ SLAMCH -#define xerbla_ XERBLA -#define lsame_ LSAME - -#define mc64id_ MC64ID -#define mc64ad_ MC64AD -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define f_ScalePermstructFree F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) - */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define dlamch_ dlamch -#define slamch_ slamch -#define xerbla_ xerbla -#define lsame_ lsame - -#define mc64id_ mc64id -#define mc64ad_ mc64ad - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#include "_hypre_utilities.h" - -#endif /* __SUPERLU_CNAMES */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,285 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. - */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ - -/* LAPACK */ -#define dlamch_ DLAMCH -#define slamch_ SLAMCH -#define xerbla_ XERBLA -#define lsame_ LSAME - -#define mc64id_ MC64ID -#define mc64ad_ MC64AD -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define f_ScalePermstructFree F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) - */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define dlamch_ dlamch -#define slamch_ slamch -#define xerbla_ xerbla -#define lsame_ lsame - -#define mc64id_ mc64id -#define mc64ad_ mc64ad - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx_ -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#endif /* __SUPERLU_CNAMES */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/comm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/comm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/comm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/comm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ -#include "superlu_ddefs.h" - - -void -bcast_tree(void *buf, int count, MPI_Datatype dtype, int root, int tag, - gridinfo_t *grid, int scope, int *recvcnt) -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Purpose - * ======= - * Broadcast an array of *dtype* numbers. The communication pattern - * is a tree with number of branches equal to NBRANCHES. - * The process ranks are between 0 and Np-1. - * - * The following two pairs of graphs give different ways of viewing the same - * algorithm. The first pair shows the trees as they should be visualized - * when examining the algorithm. The second pair are isomorphic graphs of - * of the first, which show the actual pattern of data movement. - * Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a - * hypercube broadcast (however, it does not require the nodes be a - * power of two to work). - * - * TREE BROADCAST, NBRANCHES = 2 * TREE BROADCAST, NBRANCHES = 3 - * - * root=2 - * i=4 &______________ * - * | \ * root=2 - * i=2 &______ &______ * i=3 &______________________ - * | \ | \ * | \ \ - * i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ - * | \ | \ | \ | \ * | \ \ | \ \ | \ - * 2 3 4 5 6 7 0 1 * 2 3 4 5 6 7 0 1 - * - * - * ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS: - * - * 2 2 - * _________|_________ ___________|____________ - * / | \ / | | \ - * 6 4 3 5 0 3 4 - * / \ | / \ | - * 0 7 5 6 7 1 - * | - * 1 - * - * - * Arguments - * ========= - * - * scope - */ -{ - int Iam, i, j, Np, nbranches = 2; - int destdist; /* The distance of the destination node. */ - int mydist; /* My distance from root. */ - superlu_scope_t *scp; - MPI_Status status; - - if ( scope == COMM_COLUMN ) scp = &grid->cscp; - else if ( scope == ROW ) scp = &grid->rscp; - Np = scp->Np; - if ( Np < 2 ) return; - Iam = scp->Iam; - - if ( Iam == root ) { - for (i = nbranches; i < Np; i *= nbranches); - for (i /= nbranches; i > 0; i /= nbranches) { - for (j = 1; j < nbranches; ++j) { - destdist = i*j; - if ( destdist < Np ) - MPI_Send( buf, count, dtype, (Iam+destdist)%Np, - tag, scp->comm ); - } - } - } else { - mydist = (Np + Iam - root) % Np; - for (i = nbranches; i < Np; i *= nbranches); - for (i /= nbranches; (mydist%i); i /= nbranches); -/* MPI_Probe( MPI_ANY_SOURCE, tag, scp->comm, &status );*/ - MPI_Recv( buf, count, dtype, MPI_ANY_SOURCE, tag, scp->comm, &status ); - MPI_Get_count( &status, dtype, recvcnt ); - - /* I need to send data to others. */ - while ( (i > 1) && !(mydist%i) ) { - i /= nbranches; - for (j = 1; j < nbranches; ++j) { - destdist = mydist + j*i; - if ( destdist < Np ) - MPI_Send( buf, *recvcnt, dtype, (root+destdist)%Np, - tag, scp->comm ); - } - } - } -} /* BCAST_TREE */ - - - - - - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,105 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -/* - * This file defines common arithmetic operations for complex type. - */ - -#include -#include -#include "dcomplex.h" - - -/* Complex Division c = a/b */ -void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) -{ - double ratio, den; - double abr, abi, cr, ci; - - if( (abr = b->r) < 0.) - abr = - abr; - if( (abi = b->i) < 0.) - abi = - abi; - if( abr <= abi ) { - if (abi == 0) { - fprintf(stderr, "z_div.c: division by zero"); - exit(-1); - } - ratio = b->r / b->i ; - den = b->i * (1 + ratio*ratio); - cr = (a->r*ratio + a->i) / den; - ci = (a->i*ratio - a->r) / den; - } else { - ratio = b->i / b->r ; - den = b->r * (1 + ratio*ratio); - cr = (a->r + a->i*ratio) / den; - ci = (a->i - a->r*ratio) / den; - } - c->r = cr; - c->i = ci; -} - - -/* Returns sqrt(z.r^2 + z.i^2) */ -double z_abs(doublecomplex *z) -{ - double temp; - double real = z->r; - double imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - if (imag > real) { - temp = real; - real = imag; - imag = temp; - } - if ((real+imag) == real) return(real); - - temp = imag/real; - temp = real*sqrt(1.0 + temp*temp); /*overflow!!*/ - return (temp); -} - - -/* Approximates the abs */ -/* Returns abs(z.r) + abs(z.i) */ -double z_abs1(doublecomplex *z) -{ - double real = z->r; - double imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - - return (real + imag); -} - -/* Return the exponentiation */ -void z_exp(doublecomplex *r, doublecomplex *z) -{ - double expx; - - expx = exp(z->r); - r->r = expx * cos(z->i); - r->i = expx * sin(z->i); -} - -/* Return the complex conjugate */ -void d_cnjg(doublecomplex *r, doublecomplex *z) -{ - r->r = z->r; - r->i = -z->i; -} - -/* Return the imaginary part */ -double d_imag(doublecomplex *z) -{ - return (z->i); -} - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -/* - * This header file is to be included in source files z*.c - */ -#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */ -#define __SUPERLU_DCOMPLEX - -#include - -typedef struct { double r, i; } doublecomplex; - -/* - * These variables will be defined to be MPI datatypes for complex - * and double complex. I'm too lazy to declare - * these guys external in every file that needs them. - */ -extern MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX; - - -/* Macro definitions */ - -/* Complex Addition c = a + b */ -#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \ - (c)->i = (a)->i + (b)->i; } - -/* Complex Subtraction c = a - b */ -#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \ - (c)->i = (a)->i - (b)->i; } - -/* Complex-Double Multiplication */ -#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \ - (c)->i = (a)->i * (b); } - -/* Complex-Complex Multiplication */ -#define zz_mult(c, a, b) { \ - double cr, ci; \ - cr = (a)->r * (b)->r - (a)->i * (b)->i; \ - ci = (a)->i * (b)->r + (a)->r * (b)->i; \ - (c)->r = cr; \ - (c)->i = ci; \ - } - -/* Complex equality testing */ -#define z_eq(a, b) ( (a)->r == (b)->r && (a)->i == (b)->i ) - - -#ifdef __cplusplus -extern "C" { -#endif - -/* Prototypes for functions in dcomplex.c */ -void z_div(doublecomplex *, doublecomplex *, doublecomplex *); -double z_abs(doublecomplex *); /* exact */ -double z_abs1(doublecomplex *); /* approximate */ -void z_exp(doublecomplex *, doublecomplex *); -void d_cnjg(doublecomplex *r, doublecomplex *z); -double d_imag(doublecomplex *); - - -#ifdef __cplusplus - } -#endif - - -#endif /* __SUPERLU_DCOMPLEX */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/ddistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/ddistribute.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/ddistribute.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/ddistribute.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,725 +0,0 @@ - - -#include "superlu_ddefs.h" - -int_t -ddistribute(fact_t fact, int_t n, SuperMatrix *A, - Glu_freeable_t *Glu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, - len, len1, nsupc; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NCPformat *Astore; - double *a; - int_t *asub; - int_t *xa_begin, *xa_end; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - double *lusup, *uval; /* nonzero values in L and U */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t mem_use = 0, iword, dword; - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif -#if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; -#endif - - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - dword = sizeof(double); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter ddistribute()"); -#endif - - if ( fact == SamePattern_SameRowPerm ) { - /* --------------------------------------------------------------- - * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. - * --------------------------------------------------------------- */ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We can propagate the new values of A into the existing - L and U data structures. */ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; -#if ( PRNTlevel>=1 ) - mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); -#endif - - } else { - /* -------------------------------------------------- - * FIRST TIME CREATING THE L AND U DATA STRUCTURE. - * -------------------------------------------------- */ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* No L and U data structures are available yet. - We need to set up the L and U data structures and propagate - the values of A into them. */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for ToRecv[]."); - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) - ABORT("Malloc fails for index[]."); -#if ( PRNTlevel>=1 ) - mem_use += k*sizeof(int_t*) + (j + nsupers)*iword; -#endif - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = intCalloc_dist(k)) ) - ABORT("Malloc fails for ToSendD[]."); - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. */ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); -#if ( PRNTlevel>=1 ) - mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword; -#endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - - - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); -#endif -#if ( PRNTlevel>=1 ) - mem_use -= 2*k * iword; -#endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) - *sp_ienv_dist(3))))) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); -#if ( PRNTlevel>=1 ) - mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; -#if ( PRNTlevel>=1 ) - mem_use += 4*k*sizeof(int_t*) + 2*len*iword; -#endif - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } - - jbrow = PROW( jb, grid ); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { - fprintf(stderr, "col block %d ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - /* Propagate the compressed row subscripts to Lindex[], and - the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = 0.0; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb] = NULL; - Lnzval_bc_ptr[ljb] = NULL; - } /* if nrbl ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - - } /* for jb ... */ - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); -#endif - - } /* else fact != SamePattern_SameRowPerm */ - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit ddistribute()"); -#endif - - return (mem_use); -} /* DDISTRIBUTE */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dgsequ.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,178 +0,0 @@ - - -/* - * File name: dgsequ.c - * History: Modified from LAPACK routine DGEEQU - */ -#include -#include "superlu_ddefs.h" - -void -dgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info) -{ -/* - Purpose - ======= - - DGSEQU_dist computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= M: the i-th row of A is exactly zero - > M: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int i, j, irow; - double rcmin, rcmax; - double bignum, smlnum; - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("dgsequ_dist", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = (NCformat *) A->Store; - Aval = (double *) Astore->nzval; - - /* Get machine constants. */ - smlnum = dlamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* dgsequ_dist */ - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlamch.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,964 +0,0 @@ -#include -#include "Cnames.h" -#define TRUE_ (1) -#define FALSE_ (0) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) - -double dlamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - Purpose - ======= - - DLAMCH determines double precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by DLAMCH: - = 'E' or 'e', DLAMCH := eps - = 'S' or 's , DLAMCH := sfmin - = 'B' or 'b', DLAMCH := base - = 'P' or 'p', DLAMCH := eps*base - = 'N' or 'n', DLAMCH := t - = 'R' or 'r', DLAMCH := rnd - = 'M' or 'm', DLAMCH := emin - = 'U' or 'u', DLAMCH := rmin - = 'L' or 'l', DLAMCH := emax - = 'O' or 'o', DLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ - - static int first = TRUE_; - - /* System generated locals */ - int i__1; - double ret_val; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static double base; - static int beta; - static double emin, prec, emax; - static int imin, imax; - static int lrnd; - static double rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static double small, sfmin; - extern /* Subroutine */ int dlamc2_(int *, int *, int *, - double *, int *, double *, int *, double *); - static int it; - static double rnd, eps; - - if (first) { - first = FALSE_; - dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (double) beta; - t = (double) it; - if (lrnd) { - rnd = 1.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1) / 2; - } else { - rnd = 0.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1); - } - prec = eps * base; - emin = (double) imin; - emax = (double) imax; - sfmin = rmin; - small = 1. / rmax; - if (small >= sfmin) { - - /* Use SMALL plus a bit, to avoid the possibility of rounding - causing overflow when computing 1/sfmin. */ - sfmin = small * (eps + 1.); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of DLAMCH */ - -} /* dlamch_ */ - - -/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - double d__1, d__2; - /* Local variables */ - static int lrnd; - static double a, b, c, f; - static int lbeta; - static double savec; - extern double dlamc3_(double *, double *); - static int lieee1; - static double t1, t2; - static int lt; - static double one, qtr; - - if (first) { - first = FALSE_; - one = 1.; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.; - c = dlamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = dlamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - d__1 = -a; - c = dlamc3_(&c, &d__1); - lbeta = (int) (c + qtr); - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (double) lbeta; - d__1 = b / 2; - d__2 = -b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - d__1 = b / 2; - d__2 = b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - d__1 = b / 2; - t1 = dlamc3_(&d__1, &a); - d__1 = b / 2; - t2 = dlamc3_(&d__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of DLAMC1 */ - -} /* dlamc1_ */ - - -/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, - double *eps, int *emin, double *rmin, int *emax, - double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) DOUBLE PRECISION - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) DOUBLE PRECISION - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) DOUBLE PRECISION - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - double d__1, d__2, d__3, d__4, d__5; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static int ieee; - static double half; - static int lrnd; - static double leps, zero, a, b, c; - static int i, lbeta; - static double rbase; - static int lemin, lemax, gnmin; - static double small; - static int gpmin; - static double third, lrmin, lrmax, sixth; - extern /* Subroutine */ int dlamc1_(int *, int *, int *, - int *); - extern double dlamc3_(double *, double *); - static int lieee1; - extern /* Subroutine */ int dlamc4_(int *, double *, int *), - dlamc5_(int *, int *, int *, int *, int *, - double *); - static int lt, ngnmin, ngpmin; - static double one, two; - - if (first) { - first = FALSE_; - zero = 0.; - one = 1.; - two = 2.; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - dlamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (double) lbeta; - i__1 = -lt; - a = pow_di(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - d__1 = -half; - sixth = dlamc3_(&b, &d__1); - third = dlamc3_(&sixth, &sixth); - d__1 = -half; - b = dlamc3_(&third, &d__1); - b = dlamc3_(&b, &sixth); - b = abs(b); - if (b < leps) { - b = leps; - } - - leps = 1.; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - d__1 = half * leps; -/* Computing 5th power */ - d__3 = two, d__4 = d__3, d__3 *= d__3; -/* Computing 2nd power */ - d__5 = leps; - d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); - c = dlamc3_(&d__1, &d__2); - d__1 = -c; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - d__1 = -b; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - d__1 = small * rbase; - small = dlamc3_(&d__1, &zero); -/* L20: */ - } - a = dlamc3_(&one, &small); - dlamc4_(&ngpmin, &one, &lbeta); - d__1 = -one; - dlamc4_(&ngnmin, &d__1, &lbeta); - dlamc4_(&gpmin, &a, &lbeta); - d__1 = -a; - dlamc4_(&gnmin, &d__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine DLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine DLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - d__1 = lrmin * rbase; - lrmin = dlamc3_(&d__1, &zero); -/* L30: */ - } - -/* Finally, call DLAMC5 to compute EMAX and RMAX. */ - - dlamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of DLAMC2 */ - -} /* dlamc2_ */ - - -double dlamc3_(double *a, double *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) DOUBLE PRECISION - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - double ret_val; - - ret_val = *a + *b; - - return ret_val; - -/* End of DLAMC3 */ - -} /* dlamc3_ */ - - -/* Subroutine */ int dlamc4_(int *emin, double *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC4 is a service routine for DLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) DOUBLE PRECISION - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static double zero, a; - static int i; - static double rbase, b1, b2, c1, c2, d1, d2; - extern double dlamc3_(double *, double *); - static double one; - - a = *start; - one = 1.; - rbase = one / *base; - zero = 0.; - *emin = 1; - d__1 = a * rbase; - b1 = dlamc3_(&d__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - d__1 = a / *base; - b1 = dlamc3_(&d__1, &zero); - d__1 = b1 * *base; - c1 = dlamc3_(&d__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - d__1 = a * rbase; - b2 = dlamc3_(&d__1, &zero); - d__1 = b2 / rbase; - c2 = dlamc3_(&d__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of DLAMC4 */ - -} /* dlamc4_ */ - - -/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A int flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) DOUBLE PRECISION - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static double c_b5 = 0.; - - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static int lexp; - static double oldy; - static int uexp, i; - static double y, z; - static int nbits; - extern double dlamc3_(double *, double *); - static double recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1. / *beta; - z = *beta - 1.; - y = 0.; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.) { - oldy = y; - } - y = dlamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - d__1 = y * *beta; - y = dlamc3_(&d__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of DLAMC5 */ - -} /* dlamc5_ */ - -double pow_di(double *ap, int *bp) -{ - double pow, x; - int n; - - pow = 1; - x = *ap; - n = *bp; - - if(n != 0){ - if(n < 0) { - n = -n; - x = 1/x; - } - for( ; ; ) { - if(n & 01) pow *= x; - if(n >>= 1) x *= x; - else break; - } - } - return(pow); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlangs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - - -/* - * File name: dlangs.c - * History: Modified from lapack routine DLANGE - */ -#include -#include "superlu_ddefs.h" - -double dlangs_dist(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - DLANGS_dist returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - DLANGE returns the value - - DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in DLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int_t i, j, irow; - double value=0., sum; - double *rwork; - - Astore = (NCformat *) A->Store; - Aval = (double *) Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, fabs( Aval[i]) ); - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += fabs(Aval[i]); - value = SUPERLU_MAX(value, sum); - } - - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += fabs(Aval[i]); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* dlangs_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlaqgs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,130 +0,0 @@ - - -/* - * File name: dlaqgs.c - * History: Modified from LAPACK routine DLAQGE - */ -#include -#include "superlu_ddefs.h" - -void -dlaqgs_dist(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - DLAQGS_dist equilibrates a general sparse M by N matrix A using the row - and column scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - double *Aval; - int_t i, j, irow; - double large, small, cj; - extern double dlamch_(char *); - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = (NCformat *) A->Store; - Aval = (double *) Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = dlamch_("Safe minimum") / dlamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - Aval[i] *= cj; - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= r[irow]; - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= cj * r[irow]; - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* dlaqgs_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dldperm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dldperm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dldperm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dldperm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - -extern void mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [], - int_t*, int_t [], int_t*, int_t[], int_t*, double [], - int_t [], int_t []); - -void -dldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], - double nzval[], int_t *perm, double u[], double v[]) -{ -/* - * Purpose - * ======= - * - * DLDPERM finds a row permutation so that the matrix has large - * entries on the diagonal. - * - * Arguments - * ========= - * - * job (input) int - * Control the action. Possible values for JOB are: - * = 1 : Compute a row permutation of the matrix so that the - * permuted matrix has as many entries on its diagonal as - * possible. The values on the diagonal are of arbitrary size. - * HSL subroutine MC21A/AD is used for this. - * = 2 : Compute a row permutation of the matrix so that the smallest - * value on the diagonal of the permuted matrix is maximized. - * = 3 : Compute a row permutation of the matrix so that the smallest - * value on the diagonal of the permuted matrix is maximized. - * The algorithm differs from the one used for JOB = 2 and may - * have quite a different performance. - * = 4 : Compute a row permutation of the matrix so that the sum - * of the diagonal entries of the permuted matrix is maximized. - * = 5 : Compute a row permutation of the matrix so that the product - * of the diagonal entries of the permuted matrix is maximized - * and vectors to scale the matrix so that the nonzero diagonal - * entries of the permuted matrix are one in absolute value and - * all the off-diagonal entries are less than or equal to one in - * absolute value. - * Restriction: 1 <= JOB <= 5. - * - * n (input) int - * The order of the matrix. - * - * nnz (input) int - * The number of nonzeros in the matrix. - * - * adjncy (input) int*, of size nnz - * The adjacency structure of the matrix, which contains the row - * indices of the nonzeros. - * - * colptr (input) int*, of size n+1 - * The pointers to the beginning of each column in ADJNCY. - * - * nzval (input) double*, of size nnz - * The nonzero values of the matrix. nzval[k] is the value of - * the entry corresponding to adjncy[k]. - * It is not used if job = 1. - * - * perm (output) int*, of size n - * The permutation vector. perm[i] = j means row i in the - * original matrix is in row j of the permuted matrix. - * - * u (output) double*, of size n - * If job = 5, the natural logarithms of the row scaling factors. - * - * v (output) double*, of size n - * If job = 5, the natural logarithms of the column scaling factors. - * The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j). - */ - - int_t i, liw, ldw, num; - int_t *iw, icntl[10], info[10]; - double *dw; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Enter dldperm()"); -#endif - liw = 5*n; - if ( job == 3 ) liw = 10*n + nnz; - if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); - ldw = 3*n + nnz; - if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); - - /* Increment one to get 1-based indexing. */ - for (i = 0; i <= n; ++i) ++colptr[i]; - for (i = 0; i < nnz; ++i) ++adjncy[i]; -#if ( DEBUGlevel>=2 ) - printf("LDPERM(): n %d, nnz %d\n", n, nnz); - PrintInt10("colptr", n+1, colptr); - PrintInt10("adjncy", nnz, adjncy); -#endif - - /* - * NOTE: - * ===== - * - * MC64AD assumes that column permutation vector is defined as: - * perm(i) = j means column i of permuted A is in column j of original A. - * - * Since a symmetric permutation preserves the diagonal entries. Then - * by the following relation: - * P'(A*P')P = P'A - * we can apply inverse(perm) to rows of A to get large diagonal entries. - * But, since 'perm' defined in MC64AD happens to be the reverse of - * SuperLU's definition of permutation vector, therefore, it is already - * an inverse for our purpose. We will thus use it directly. - * - */ - mc64id_(icntl); -#if 0 - /* Suppress error and warning messages. */ - icntl[0] = -1; - icntl[1] = -1; -#endif - - mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm, - &liw, iw, &ldw, dw, icntl, info); - -#if ( DEBUGlevel>=2 ) - PrintInt10("perm", n, perm); - printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); -#endif - if ( info[0] == 1 ) { /* Structurally singular */ - printf(".. The last %d permutations:\n", n-num); - PrintInt10("perm", n-num, &perm[num]); - } - - /* Restore to 0-based indexing. */ - for (i = 0; i <= n; ++i) --colptr[i]; - for (i = 0; i < nnz; ++i) --adjncy[i]; - for (i = 0; i < n; ++i) --perm[i]; - - if ( job == 5 ) - for (i = 0; i < n; ++i) { - u[i] = dw[i]; - v[i] = dw[n+i]; - } - - SUPERLU_FREE(iw); - SUPERLU_FREE(dw); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Exit dldperm()"); -#endif -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,146 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - - -/* Variables external to this file */ -extern LU_stack_t stack; - - -void *duser_malloc_dist(int_t bytes, int_t which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void duser_free_dist(int_t bytes, int_t which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - mem_usage_t *mem_usage) -{ - register int_t dword, gb, iword, k, maxsup, nb, nsupers; - int_t *index, *xsup; - int iam, mycol, myrow; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - iword = sizeof(int_t); - dword = sizeof(double); - maxsup = sp_ienv_dist(3); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - mem_usage->for_lu = 0; - - /* For L factor */ - nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->npcol + mycol; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Lrowind_bc_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float) - ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); - mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); - } - } - } - - /* For U factor */ - nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->nprow + myrow; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Ufstnz_br_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float)(index[2] * iword); - mem_usage->for_lu += (float)(index[1] * dword); - } - } - } - - /* Working storage to support factorization */ - mem_usage->total = mem_usage->for_lu; - mem_usage->total += - (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + - ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); - /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ - mem_usage->total += - (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); - mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; - k = CEILING( nsupers, grid->nprow ); - mem_usage->total += (float)(2 * k * iword); - - return 0; -} /* dQuerySpace_dist */ - - -/* - * Allocate storage for original matrix A - */ -void -dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa) -{ - *a = (double *) doubleMalloc_dist(nnz); - *asub = (int_t *) intMalloc_dist(nnz); - *xa = (int_t *) intMalloc_dist(n+1); -} - - -double *doubleMalloc_dist(int_t n) -{ - double *buf; - buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); - return (buf); -} - -double *doubleCalloc_dist(int_t n) -{ - double *buf; - register int_t i; - double zero = 0.0; - buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double)); - if ( !buf ) return (buf); - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,146 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - - -/* Variables external to this file */ -extern LU_stack_t stack; - - -void *duser_malloc_dist(int_t bytes, int_t which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void duser_free_dist(int_t bytes, int_t which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - mem_usage_t *mem_usage) -{ - register int_t dword, gb, iword, k, maxsup, nb, nsupers; - int_t *index, *xsup; - int iam, mycol, myrow; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - iword = sizeof(int_t); - dword = sizeof(double); - maxsup = sp_ienv_dist(3); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - mem_usage->for_lu = 0; - - /* For L factor */ - nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->npcol + mycol; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Lrowind_bc_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float) - ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); - mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); - } - } - } - - /* For U factor */ - nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->nprow + myrow; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Ufstnz_br_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float)(index[2] * iword); - mem_usage->for_lu += (float)(index[1] * dword); - } - } - } - - /* Working storage to support factorization */ - mem_usage->total = mem_usage->for_lu; - mem_usage->total += - (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + - ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); - /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ - mem_usage->total += - (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); - mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; - k = CEILING( nsupers, grid->nprow ); - mem_usage->total += (float)(2 * k * iword); - - return 0; -} /* dQuerySpace_dist */ - - -/* - * Allocate storage for original matrix A - */ -void -dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa) -{ - *a = (double *) doubleMalloc_dist(nnz); - *asub = (int_t *) intMalloc_dist(nnz); - *xa = (int_t *) intMalloc_dist(n+1); -} - - -double *doubleMalloc_dist(int_t n) -{ - double *buf; - buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); - return (buf); -} - -double *doubleCalloc_dist(int_t n) -{ - double *buf; - register int_t i; - double zero = 0.0; - buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); - if ( !buf ) return (buf); - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.patch hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.patch --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.patch 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -132c132 -< buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); ---- -> buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); -141c141 -< buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); ---- -> buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double)); diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmyblas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmyblas2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,225 +0,0 @@ - - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void dlsolve ( int ldm, int ncol, double *M, double *rhs ) -{ - int k; - double x0, x1, x2, x3, x4, x5, x6, x7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - - M0 = &M[0]; - - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - Mki4 = Mki3 + ldm + 1; - Mki5 = Mki4 + ldm + 1; - Mki6 = Mki5 + ldm + 1; - Mki7 = Mki6 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++; - x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++; - x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; - x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - rhs[++firstcol] = x4; - rhs[++firstcol] = x5; - rhs[++firstcol] = x6; - rhs[++firstcol] = x7; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++ - - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++ - x7 * *Mki7++; - - M0 += 8 * ldm + 8; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++; - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; - - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -dusolve ( - int ldm, /* in */ - int ncol, /* in */ - double *M, /* in */ - double *rhs /* modified */ -) -{ - double xj; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) - rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void dmatvec ( - int ldm, /* in -- leading dimension of M */ - int nrow, /* in */ - int ncol, /* in */ - double *M, /* in */ - double *vec, /* in */ - double *Mxvec /* in/out */ -) -{ - double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - int k; - - M0 = &M[0]; - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - Mki4 = Mki3 + ldm; - Mki5 = Mki4 + ldm; - Mki6 = Mki5 + ldm; - Mki7 = Mki6 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - vi4 = vec[firstcol++]; - vi5 = vec[firstcol++]; - vi6 = vec[firstcol++]; - vi7 = vec[firstcol++]; - - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ - + vi4 * *Mki4++ + vi5 * *Mki5++ - + vi6 * *Mki6++ + vi7 * *Mki7++; - - M0 += 8 * ldm; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ ; - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++; - - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dreadhb.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dreadhb.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dreadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dreadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,371 +0,0 @@ -#include -#include -#include -#include "superlu_ddefs.h" - -/* - * Prototypes - */ -static void ReadVector(FILE *, int_t, int_t *, int_t, int_t); -static void dReadValues(FILE *, int_t, double *, int_t, int_t); -static void FormFullA(int_t, int_t *, double **, int_t **, int_t **); -static int dDumpLine(FILE *); -static int dParseIntFormat(char *, int_t *, int_t *); -static int dParseFloatFormat(char *, int_t *, int_t *); - - -void -dreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, - double **nzval, int_t **rowind, int_t **colptr) -{ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * - * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format - * as described below. - * - * Line 1 (A72,A8) - * Col. 1 - 72 Title (TITLE) - * Col. 73 - 80 Key (KEY) - * - * Line 2 (5I14) - * Col. 1 - 14 Total number of lines excluding header (TOTCRD) - * Col. 15 - 28 Number of lines for pointers (PTRCRD) - * Col. 29 - 42 Number of lines for row (or variable) indices (INDCRD) - * Col. 43 - 56 Number of lines for numerical values (VALCRD) - * Col. 57 - 70 Number of lines for right-hand sides (RHSCRD) - * (including starting guesses and solution vectors - * if present) - * (zero indicates no right-hand side data is present) - * - * Line 3 (A3, 11X, 4I14) - * Col. 1 - 3 Matrix type (see below) (MXTYPE) - * Col. 15 - 28 Number of rows (or variables) (NROW) - * Col. 29 - 42 Number of columns (or elements) (NCOL) - * Col. 43 - 56 Number of row (or variable) indices (NNZERO) - * (equal to number of entries for assembled matrices) - * Col. 57 - 70 Number of elemental matrix entries (NELTVL) - * (zero in the case of assembled matrices) - * Line 4 (2A16, 2A20) - * Col. 1 - 16 Format for pointers (PTRFMT) - * Col. 17 - 32 Format for row (or variable) indices (INDFMT) - * Col. 33 - 52 Format for numerical values of coefficient matrix (VALFMT) - * Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) - * - * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present - * Col. 1 Right-hand side type: - * F for full storage or M for same format as matrix - * Col. 2 G if a starting vector(s) (Guess) is supplied. (RHSTYP) - * Col. 3 X if an exact solution vector(s) is supplied. - * Col. 15 - 28 Number of right-hand sides (NRHS) - * Col. 29 - 42 Number of row indices (NRHSIX) - * (ignored in case of unassembled matrices) - * - * The three character type field on line 3 describes the matrix type. - * The following table lists the permitted values for each of the three - * characters. As an example of the type field, RSA denotes that the matrix - * is real, symmetric, and assembled. - * - * First Character: - * R Real matrix - * C Complex matrix - * P Pattern only (no numerical values supplied) - * - * Second Character: - * S Symmetric - * U Unsymmetric - * H Hermitian - * Z Skew symmetric - * R Rectangular - * - * Third Character: - * A Assembled - * E Elemental matrices (unassembled) - * - */ - - register int_t i, numer_lines, rhscrd = 0; - int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; - char buf[100], type[4]; - int_t sym; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Enter dreadhb_dist()"); -#endif - - /* Line 1 */ - fgets(buf, 100, fp); - /*dDumpLine(fp);*/ - /*if ( !iam ) fflush(stdout);*/ - - /* Line 2 */ - for (i=0; i<5; i++) { - fscanf(fp, "%14c", buf); buf[14] = 0; - tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ - if (i == 3) numer_lines = tmp; - if (i == 4 && tmp) rhscrd = tmp; - } - dDumpLine(fp); - - /* Line 3 */ - fscanf(fp, "%3c", type); - fscanf(fp, "%11c", buf); /* pad */ - type[3] = 0; -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf("Matrix type %s\n", type); -#endif - - fscanf(fp, "%14c", buf); *nrow = atoi(buf); /*sscanf(buf, "%d", nrow);*/ - fscanf(fp, "%14c", buf); *ncol = atoi(buf); /*sscanf(buf, "%d", ncol);*/ - fscanf(fp, "%14c", buf); *nonz = atoi(buf); /*sscanf(buf, "%d", nonz);*/ - fscanf(fp, "%14c", buf); tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ - - if (tmp != 0) - if ( !iam ) printf("This is not an assembled matrix!\n"); - if (*nrow != *ncol) - if ( !iam ) printf("Matrix is not square.\n"); - dDumpLine(fp); - - /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ - dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); - - /* Line 4: format statement */ - fscanf(fp, "%16c", buf); - dParseIntFormat(buf, &colnum, &colsize); - fscanf(fp, "%16c", buf); - dParseIntFormat(buf, &rownum, &rowsize); - fscanf(fp, "%20c", buf); - dParseFloatFormat(buf, &valnum, &valsize); - fscanf(fp, "%20c", buf); - dDumpLine(fp); - - /* Line 5: right-hand side */ - if ( rhscrd ) dDumpLine(fp); /* skip RHSFMT */ - -#if ( DEBUGlevel>=1 ) - if ( !iam ) { - printf("%d rows, %d nonzeros\n", *nrow, *nonz); - printf("colnum %d, colsize %d\n", colnum, colsize); - printf("rownum %d, rowsize %d\n", rownum, rowsize); - printf("valnum %d, valsize %d\n", valnum, valsize); - } -#endif - - ReadVector(fp, *ncol+1, *colptr, colnum, colsize); -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf("read colptr[%d] = %d\n", *ncol, (*colptr)[*ncol]); -#endif - ReadVector(fp, *nonz, *rowind, rownum, rowsize); -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf("read rowind[%d] = %d\n", *nonz-1, (*rowind)[*nonz-1]); -#endif - if ( numer_lines ) { - dReadValues(fp, *nonz, *nzval, valnum, valsize); -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf("read nzval[%d] = %e\n", *nonz-1, (*nzval)[*nonz-1]); -#endif - } - - sym = (type[1] == 'S' || type[1] == 's'); - if ( sym ) { - FormFullA(*ncol, nonz, nzval, rowind, colptr); - } - - /*if ( !iam ) fflush(stdout);*/ - fclose(fp); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Exit dreadhb_dist()"); -#endif -} - -/* Eat up the rest of the current line */ -static int dDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -static int dParseIntFormat(char *buf, int_t *num, int_t *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - *size = atoi(tmp); /*sscanf(tmp, "%d", size);*/ - return 0; -} - -static int dParseFloatFormat(char *buf, int_t *num, int_t *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ - - return 0; -} - -static void -ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) -{ - register int_t i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jnrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - xerbla_("sp_dtrsv_dist", &i); - return 0; - } - - Lstore = (SCformat *) L->Store; - Lval = (double *) Lstore->nzval; - Ustore = (NCformat *) U->Store; - Uval = (double *) Ustore->nzval; - solve_ops = 0; - - if ( !(work = doubleCalloc_dist(L->nrow)) ) - ABORT("Malloc fails for work in sp_dtrsv_dist()."); - - if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1); - solve_ops += 2 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - x[irow] -= x[fsupc] * Lval[luptr]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); - - dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1); -#endif /* _CRAY */ -#else - dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - x[irow] -= work[i]; /* Scatter */ - work[i] = 0.0; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - x[irow] -= x[fsupc] * Uval[i]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("N", strlen("N")); - STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - x[irow] -= x[jcol] * Uval[i]; - } - } - } - } /* for k ... */ - - } - } else { /* Form x := inv(A')*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 2 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - x[jcol] -= x[irow] * Lval[i]; - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += nsupc * (nsupc - 1); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L", "T", "U", - &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - x[jcol] -= x[irow] * Uval[i]; - } - } - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U", "T", "N", - &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - /*SuperLUStat.ops[SOLVE] += solve_ops;*/ - SUPERLU_FREE(work); - return 0; -} - - - - -int -sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x, - int incx, double beta, double *y, int incy) -{ -/* Purpose - ======= - - sp_dgemv_dist() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) double - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - In the future, more general A can be handled. - - X - (input) double*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) double - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) double*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int info; - double temp; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - - notran = lsame_(trans, "N"); - Astore = (NCformat *) A->Store; - Aval = (double *) Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - xerbla_("sp_dgemv_dist ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.) - return 0; - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (lsame_(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if (beta != 1.) { - if (incy == 1) { - if (beta == 0.) - for (i = 0; i < leny; ++i) y[i] = 0.; - else - for (i = 0; i < leny; ++i) y[i] = beta * y[i]; - } else { - iy = ky; - if (beta == 0.) - for (i = 0; i < leny; ++i) { - y[iy] = 0.; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - y[iy] = beta * y[iy]; - iy += incy; - } - } - } - - if (alpha == 0.) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if (x[jx] != 0.) { - temp = alpha * x[jx]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - y[irow] += temp * Aval[i]; - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp += Aval[i] * x[irow]; - } - y[jy] += alpha * temp; - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_dgemv_dist */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,118 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "superlu_ddefs.h" - -int -sp_dgemm_dist(char *transa, char *transb, int m, int n, int k, - double alpha, SuperMatrix *A, double *b, int ldb, - double beta, double *c, int ldc) -{ -/* Purpose - ======= - - sp_d performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) double - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - In the future, more general A can be handled. - - B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) double - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_dgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dutil.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,609 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "superlu_ddefs.h" - -void -dCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, - double *nzval, int_t *rowind, int_t *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NCformat *) A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -dCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n, - int_t nnz_loc, int_t m_loc, int_t fst_row, - double *nzval, int_t *colind, int_t *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat_loc *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NRformat_loc *) A->Store; - Astore->nnz_loc = nnz_loc; - Astore->fst_row = fst_row; - Astore->m_loc = m_loc; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, - double *a, int_t *colind, int_t *rowptr, - double **at, int_t **rowind, int_t **colptr) -{ - register int i, j, col, relpos; - int_t *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (double *) doubleMalloc_dist(nnz); - *rowind = intMalloc_dist(nnz); - *colptr = intMalloc_dist(n+1); - marker = intCalloc_dist(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - -/* Copy matrix A into matrix B. */ -void -dCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void dPrint_CompCol_Matrix_dist(SuperMatrix *A) -{ - NCformat *Astore; - register int i; - double *dp; - - printf("\nCompCol matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NCformat *) A->Store; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - if ( (dp = (double *) Astore->nzval) != NULL ) { - printf("nzval:\n"); - for (i = 0; i < Astore->nnz; ++i) printf("%f ", dp[i]); - } - printf("\nrowind:\n"); - for (i = 0; i < Astore->nnz; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr:\n"); - for (i = 0; i <= A->ncol; ++i) printf("%d ", Astore->colptr[i]); - printf("\nend CompCol matrix.\n"); -} - -void dPrint_Dense_Matrix_dist(SuperMatrix *A) -{ - DNformat *Astore; - register int i; - double *dp; - - printf("\nDense matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - dp = (double *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda); - printf("\nnzval: "); - for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i]); - printf("\nend Dense matrix.\n"); -} - -int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *A) -{ - NRformat_loc *Astore; - int_t i, nnz_loc, m_loc; - double *dp; - - printf("\n==== CompRowLoc matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NRformat_loc *) A->Store; - printf("nrow %d, ncol %d\n", A->nrow,A->ncol); - nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; - printf("nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc, - Astore->fst_row); - PrintInt10("rowptr", m_loc+1, Astore->rowptr); - PrintInt10("colind", nnz_loc, Astore->colind); - if ( (dp = (double *) Astore->nzval) != NULL ) - PrintDouble5("nzval", nnz_loc, dp); - printf("==== end CompRowLoc matrix\n"); -} - -int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A) -{ - NRformat_loc *Astore; - int_t i, nnz_loc, m_loc; - double *dp; - - fprintf(fp, "\n==== CompRowLoc matrix: "); - fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NRformat_loc *) A->Store; - fprintf(fp, "nrow %d, ncol %d\n", A->nrow, A->ncol); - nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; - fprintf(fp, "nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc, - Astore->fst_row); - file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr); - file_PrintInt10(fp, "colind", nnz_loc, Astore->colind); - if ( (dp = (double *) Astore->nzval) != NULL ) - file_PrintDouble5(fp, "nzval", nnz_loc, dp); - fprintf(fp, "==== end CompRowLoc matrix\n"); -} - -void -dCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, double *x, - int_t ldx, Stype_t stype, Dtype_t dtype, - Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (double *) x; -} - -void -dCopy_Dense_Matrix_dist(int_t M, int_t N, double *X, int_t ldx, - double *Y, int_t ldy) -{ -/* - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -dCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, - double *nzval, int_t *nzval_colptr, - int_t *rowind, int_t *rowind_colptr, - int_t *col_to_sup, int_t *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - -void -dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - if ( i % 2 ) x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/ - else x[i + j*ldx] = 1.0; - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -dFillRHS_dist(char *trans, int_t nrhs, double *x, int_t ldx, - SuperMatrix *A, double *rhs, int_t ldb) -{ - double one = 1.0; - double zero = 0.0; - - sp_dgemm_dist(trans, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldb); - -} - -/* - * Fills a double precision array with a given value. - */ -void -dfill_dist(double *a, int_t alen, double dval) -{ - register int_t i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void dinf_norm_error_dist(int_t n, int_t nrhs, double *x, int_t ldx, - double *xtrue, int_t ldxtrue, - gridinfo_t *grid) -{ - double err, xnorm; - double *x_work, *xtrue_work; - int i, j; - - for (j = 0; j < nrhs; j++) { - x_work = &x[j*ldx]; - xtrue_work = &xtrue[j*ldxtrue]; - err = xnorm = 0.0; - for (i = 0; i < n; i++) { - err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); - xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); - } - err = err / xnorm; - printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err); - } -} - -void PrintDouble5(char *name, int_t len, double *x) -{ - register int_t i; - - printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 5 == 0 ) printf("\n[%2d-%2d] ", i, i+4); - printf("%14e", x[i]); - } - printf("\n"); -} - -int file_PrintDouble5(FILE *fp, char *name, int_t len, double *x) -{ - register int_t i; - - fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 5 == 0 ) fprintf(fp, "\n[%2d-%2d] ", i, i+4); - fprintf(fp, "%14e", x[i]); - } - fprintf(fp, "\n"); -} - -/* - * Print the blocks in the factored matrix L. - */ -void dPrintLblocks(int_t iam, int_t nsupers, gridinfo_t *grid, - Glu_persist_t *Glu_persist, LocalLU_t *Llu) -{ - register int_t c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb; - register int_t k, mycol, r; - int_t *xsup = Glu_persist->xsup; - int_t *index; - double *nzval; - - printf("\n(%d) L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam); - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - printf("(%d) block column %d (local), # row blocks %d\n", - iam, lb, nb); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - printf("(%d) row-block %d: block # %d\tlength %d\n", - iam, c, index[k], len); - PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]); - for (j = 0; j < nsupc; ++j) { - PrintDouble5("nzval", len, &nzval[r + j*nsupr]); - } - k += LB_DESCRIPTOR + len; - r += len; - } - } - printf("(%d)", iam); - PrintInt10("ToSendR[]", grid->npcol, Llu->ToSendR[lb]); - PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); - } - printf("nfrecvx %4d\n", Llu->nfrecvx); - k = CEILING( nsupers, grid->nprow ); - PrintInt10("fmod", k, Llu->fmod); - -} /* DPRINTLBLOCKS */ - - -/* - * Print the blocks in the factored matrix U. - */ -void dPrintUblocks(int_t iam, int_t nsupers, gridinfo_t *grid, - Glu_persist_t *Glu_persist, LocalLU_t *Llu) -{ - register int_t c, extra, jb, k, lb, len, nb, nrb, nsupc; - register int_t myrow, r; - int_t *xsup = Glu_persist->xsup; - int_t *index; - double *nzval; - - printf("\n(%d) U BLOCKS IN ROW-MAJOR ORDER -->\n", iam); - nrb = nsupers / grid->nprow; - extra = nsupers % grid->nprow; - myrow = MYROW( iam, grid ); - if ( myrow < extra ) ++nrb; - for (lb = 0; lb < nrb; ++lb) { - index = Llu->Ufstnz_br_ptr[lb]; - if ( index ) { /* Not an empty row */ - nzval = Llu->Unzval_br_ptr[lb]; - nb = index[0]; - printf("(%d) block row %d (local), # column blocks %d\n", - iam, lb, nb); - r = 0; - for (c = 0, k = BR_HEADER; c < nb; ++c) { - jb = index[k]; - len = index[k+1]; - printf("(%d) col-block %d: block # %d\tlength %d\n", - iam, c, jb, index[k+1]); - nsupc = SuperSize( jb ); - PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]); - PrintDouble5("nzval", len, &nzval[r]); - k += UB_DESCRIPTOR + nsupc; - r += len; - } - - printf("(%d) ToSendD[] %d\n", iam, Llu->ToSendD[lb]); - } - } -} /* DPRINTUBLOCKS */ - -int -dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm, - gridinfo_t *grid) -{ - int_t procs = grid->nprow*grid->npcol; - fprintf(fp, "TotalIndSend %d\tTotalValSend %d\n", gsmv_comm->TotalIndSend, - gsmv_comm->TotalValSend); - file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start); - file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend); - file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv); - file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend); - file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv); - file_PrintInt10(fp, "SendCounts", procs, gsmv_comm->SendCounts); - file_PrintInt10(fp, "RecvCounts", procs, gsmv_comm->RecvCounts); -} - - -void -GenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist, - gridinfo_t *grid, double **xact, int *ldx, double **b, int *ldb) -{ - int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows, - nsupr, nsupers, rel; - int_t *supno, *xsup, *lxsup; - double *x, *bb; - NCformat *Astore; - double *Aval; - - n = A->ncol; - *ldb = 0; - supno = Glu_persist->supno; - xsup = Glu_persist->xsup; - nsupers = supno[n-1] + 1; - iam = grid->iam; - myrow = MYROW( iam, grid ); - Astore = (NCformat *) A->Store; - Aval = (double *) Astore->nzval; - lb = CEILING( nsupers, grid->nprow ) + 1; - if ( !(lxsup = intMalloc_dist(lb)) ) - ABORT("Malloc fails for lxsup[]."); - - lsup = 0; - nlrows = 0; - for (j = 0; j < nsupers; ++j) { - i = PROW( j, grid ); - if ( myrow == i ) { - nsupr = SuperSize( j ); - *ldb += nsupr; - lxsup[lsup++] = nlrows; - nlrows += nsupr; - } - } - *ldx = n; - if ( !(x = doubleMalloc_dist(((size_t)*ldx) * nrhs)) ) - ABORT("Malloc fails for x[]."); - if ( !(bb = doubleCalloc_dist(*ldb * nrhs)) ) - ABORT("Calloc fails for bb[]."); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0; - - /* Form b = A*x. */ - for (j = 0; j < n; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - gb = supno[irow]; - gbrow = PROW( gb, grid ); - if ( myrow == gbrow ) { - rel = irow - xsup[gb]; - lb = LBi( gb, grid ); - bb[lxsup[lb] + rel] += Aval[i] * x[j]; - } - } - - /* Memory allocated but not freed: xact, b */ - *xact = x; - *b = bb; - - SUPERLU_FREE(lxsup); - -#if ( PRNTlevel>=2 ) - for (i = 0; i < grid->nprow*grid->npcol; ++i) { - if ( iam == i ) { - printf("\n(%d)\n", iam); - PrintDouble5("rhs", *ldb, *b); - } - MPI_Barrier( grid->comm ); - } -#endif - -} /* GENXTRUERHS */ - -/* g5.rua - b = A*x y = L\b - 0 1 1.0000 - 1 0 0.2500 - 2 1 1.0000 - 3 2 2.0000 - 4 1 1.7500 - 5 1 1.8917 - 6 0 1.1879 - 7 2 2.0000 - 8 2 2.0000 - 9 1 1.0000 - 10 1 1.7500 - 11 0 0 - 12 1 1.8750 - 13 2 2.0000 - 14 1 1.0000 - 15 0 0.2500 - 16 1 1.7667 - 17 0 0.6419 - 18 1 2.2504 - 19 0 1.1563 - 20 0 0.9069 - 21 0 1.4269 - 22 1 2.7510 - 23 1 2.2289 - 24 0 2.4332 - - g6.rua - b=A*x y=L\b - 0 0 0 - 1 1 1.0000 - 2 1 1.0000 - 3 2 2.5000 - 4 0 0 - 5 2 2.0000 - 6 1 1.0000 - 7 1 1.7500 - 8 1 1.0000 - 9 0 0.2500 - 10 0 0.5667 - 11 1 2.0787 - 12 0 0.8011 - 13 1 1.9838 - 14 1 1.0000 - 15 1 1.0000 - 16 2 2.5000 - 17 0 0.8571 - 18 0 0 - 19 1 1.0000 - 20 0 0.2500 - 21 1 1.0000 - 22 2 2.0000 - 23 1 1.7500 - 24 1 1.8917 - 25 0 1.1879 - 26 0 0.8011 - 27 1 1.9861 - 28 1 2.0199 - 29 0 1.3620 - 30 0 0.6136 - 31 1 2.3677 - 32 0 1.1011 - 33 0 1.5258 - 34 0 1.7628 - 35 0 2.1658 -*/ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/etree.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/etree.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/etree.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/etree.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,419 +0,0 @@ - -/* Elimination tree computation and layout routines */ - -#include -#include -#include "superlu_ddefs.h" - -/* - * Implementation of disjoint set union routines. - * Elements are integers in 0..n-1, and the - * names of the sets themselves are of type int. - * - * Calls are: - * initialize_disjoint_sets (n) initial call. - * s = make_set (i) returns a set containing only i. - * s = link (t, u) returns s = t union u, destroying t and u. - * s = find (i) return name of set containing i. - * finalize_disjoint_sets final call. - * - * This implementation uses path compression but not weighted union. - * See Tarjan's book for details. - * John Gilbert, CMI, 1987. - * - * Implemented path-halving by XL 7/5/95. - */ - - -static -int_t *mxCallocInt(int_t n) -{ - register int_t i; - int_t *buf; - - buf = (int_t *) SUPERLU_MALLOC( n * sizeof(int_t) ); - if ( buf ) - for (i = 0; i < n; i++) buf[i] = 0; - return (buf); -} - -static -void initialize_disjoint_sets ( - int_t n, - int_t **pp /* parent array for sets */ - ) -{ - if ( !( (*pp) = mxCallocInt(n)) ) - ABORT("mxCallocInit fails for pp[]"); -} - - -static -int_t make_set ( - int_t i, - int_t *pp /* parent array for sets */ - ) -{ - pp[i] = i; - return i; -} - - -static -int_t link ( - int_t s, - int_t t, - int_t *pp - ) -{ - pp[s] = t; - return t; -} - - -/* PATH HALVING */ -static -int_t find ( - int_t i, - int_t *pp - ) -{ - register int_t p, gp; - - p = pp[i]; - gp = pp[p]; - while (gp != p) { - pp[i] = gp; - i = gp; - p = pp[i]; - gp = pp[p]; - } - return (p); -} - -#if 0 -/* PATH COMPRESSION */ -static -int_t find ( - int_t i - ) -{ - if (pp[i] != i) - pp[i] = find (pp[i]); - return pp[i]; -} -#endif - -static -void finalize_disjoint_sets ( - int_t *pp - ) -{ - SUPERLU_FREE(pp); -} - -/* - * p = spsymetree (A); - * - * Find the elimination tree for symmetric matrix A. - * This uses Liu's algorithm, and runs in time O(nz*log n). - * - * Input: - * Square sparse matrix A. No check is made for symmetry; - * elements below and on the diagonal are ignored. - * Numeric values are ignored, so any explicit zeros are - * treated as nonzero. - * Output: - * Integer array of parents representing the etree, with n - * meaning a root of the elimination forest. - * Note: - * This routine uses only the upper triangle, while sparse - * Cholesky (as in spchol.c) uses only the lower. Matlab's - * dense Cholesky uses only the upper. This routine could - * be modified to use the lower triangle either by transposing - * the matrix or by traversing it by rows with auxiliary - * pointer and link arrays. - * - * John R. Gilbert, Xerox, 10 Dec 1990 - * Based on code by JRG dated 1987, 1988, and 1990. - * Modified by X.S. Li, November 1999. - */ - -/* - * Symmetric elimination tree - */ -int_t -sp_symetree_dist( - int_t *acolst, int_t *acolend, /* column starts and ends past 1 */ - int_t *arow, /* row indices of A */ - int_t n, /* dimension of A */ - int_t *parent /* parent in elim tree */ - ) -{ - int_t *root; /* root of subtee of etree */ - int_t rset, cset; - int_t row, col; - int_t rroot; - int_t p; - int_t *pp; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Enter sp_symetree()"); -#endif - - root = mxCallocInt (n); - initialize_disjoint_sets (n, &pp); - - for (col = 0; col < n; col++) { - cset = make_set (col, pp); - root[cset] = col; - parent[col] = n; /* Matlab */ - for (p = acolst[col]; p < acolend[col]; p++) { - row = arow[p]; - if (row >= col) continue; - rset = find (row, pp); - rroot = root[rset]; - if (rroot != col) { - parent[rroot] = col; - cset = link (cset, rset, pp); - root[cset] = col; - } - } - } - SUPERLU_FREE (root); - finalize_disjoint_sets (pp); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Exit sp_symetree()"); -#endif - return 0; -} /* SP_SYMETREE_DIST */ - - -/* - * Find the elimination tree for A'*A. - * This uses something similar to Liu's algorithm. - * It runs in time O(nz(A)*log n) and does not form A'*A. - * - * Input: - * Sparse matrix A. Numeric values are ignored, so any - * explicit zeros are treated as nonzero. - * Output: - * Integer array of parents representing the elimination - * tree of the symbolic product A'*A. Each vertex is a - * column of A, and nc means a root of the elimination forest. - * - * John R. Gilbert, Xerox, 10 Dec 1990 - * Based on code by JRG dated 1987, 1988, and 1990. - */ - -/* - * Nonsymmetric elimination tree - */ -int_t -sp_coletree_dist( - int_t *acolst, int_t *acolend, /* column start and end past 1 */ - int_t *arow, /* row indices of A */ - int_t nr, int_t nc, /* dimension of A */ - int_t *parent /* parent in elim tree */ - ) -{ - int_t *root; /* root of subtee of etree */ - int_t *firstcol; /* first nonzero col in each row*/ - int_t rset, cset; - int_t row, col; - int_t rroot; - int_t p; - int_t *pp; - -#if ( DEBUGlevel>=1 ) - int_t iam = 0; - CHECK_MALLOC(iam, "Enter sp_coletree()"); -#endif - - root = mxCallocInt (nc); - initialize_disjoint_sets (nc, &pp); - - /* Compute firstcol[row] = first nonzero column in row */ - - firstcol = mxCallocInt (nr); - for (row = 0; row < nr; firstcol[row++] = nc); - for (col = 0; col < nc; col++) - for (p = acolst[col]; p < acolend[col]; p++) { - row = arow[p]; - firstcol[row] = SUPERLU_MIN(firstcol[row], col); - } - - /* Compute etree by Liu's algorithm for symmetric matrices, - except use (firstcol[r],c) in place of an edge (r,c) of A. - Thus each row clique in A'*A is replaced by a star - centered at its first vertex, which has the same fill. */ - - for (col = 0; col < nc; col++) { - cset = make_set (col, pp); - root[cset] = col; - parent[col] = nc; /* Matlab */ - for (p = acolst[col]; p < acolend[col]; p++) { - row = firstcol[arow[p]]; - if (row >= col) continue; - rset = find (row, pp); - rroot = root[rset]; - if (rroot != col) { - parent[rroot] = col; - cset = link (cset, rset, pp); - root[cset] = col; - } - } - } - - SUPERLU_FREE (root); - SUPERLU_FREE (firstcol); - finalize_disjoint_sets (pp); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit sp_coletree()"); -#endif - return 0; -} /* SP_COLETREE_DIST */ - -/* - * q = TreePostorder_dist (n, p); - * - * Postorder a tree. - * Input: - * p is a vector of parent pointers for a forest whose - * vertices are the integers 0 to n-1; p[root]==n. - * Output: - * q is a vector indexed by 0..n-1 such that q[i] is the - * i-th vertex in a postorder numbering of the tree. - * - * ( 2/7/95 modified by X.Li: - * q is a vector indexed by 0:n-1 such that vertex i is the - * q[i]-th vertex in a postorder numbering of the tree. - * That is, this is the inverse of the previous q. ) - * - * In the child structure, lower-numbered children are represented - * first, so that a tree which is already numbered in postorder - * will not have its order changed. - * - * Written by John Gilbert, Xerox, 10 Dec 1990. - * Based on code written by John Gilbert at CMI in 1987. - */ - -static int_t *first_kid, *next_kid; /* Linked list of children. */ -static int_t *post, postnum; - -static -/* - * Depth-first search from vertex v. - */ -void etdfs ( - int_t v, - int_t first_kid[], - int_t next_kid[], - int_t post[], - int_t *postnum - ) -{ - int w; - - for (w = first_kid[v]; w != -1; w = next_kid[w]) { - etdfs (w, first_kid, next_kid, post, postnum); - } - /* post[postnum++] = v; in Matlab */ - post[v] = (*postnum)++; /* Modified by X. Li on 08/10/07 */ -} - - -static -/* - * Depth-first search from vertex n. - * No recursion. - */ -void nr_etdfs (int_t n, int_t *parent, - int_t *first_kid, int_t *next_kid, - int_t *post, int_t postnum) -{ - int_t current = n, first, next; - - while (postnum != n){ - - /* no kid for the current node */ - first = first_kid[current]; - - /* no first kid for the current node */ - if (first == -1){ - - /* numbering this node because it has no kid */ - post[current] = postnum++; - - /* looking for the next kid */ - next = next_kid[current]; - - while (next == -1){ - - /* no more kids : back to the parent node */ - current = parent[current]; - - /* numbering the parent node */ - post[current] = postnum++; - - /* get the next kid */ - next = next_kid[current]; - } - - /* stopping criterion */ - if (postnum==n+1) return; - - /* updating current node */ - current = next; - } - /* updating current node */ - else { - current = first; - } - } -} - -/* - * Post order a tree - */ -int_t *TreePostorder_dist( - int_t n, - int_t *parent - ) -{ - int_t v, dad; - int_t *first_kid, *next_kid, *post, postnum; - - /* Allocate storage for working arrays and results */ - if ( !(first_kid = mxCallocInt (n+1)) ) - ABORT("mxCallocInt fails for first_kid[]"); - if ( !(next_kid = mxCallocInt (n+1)) ) - ABORT("mxCallocInt fails for next_kid[]"); - if ( !(post = mxCallocInt (n+1)) ) - ABORT("mxCallocInt fails for post[]"); - - /* Set up structure describing children */ - for (v = 0; v <= n; first_kid[v++] = -1); - for (v = n-1; v >= 0; v--) { - dad = parent[v]; - next_kid[v] = first_kid[dad]; - first_kid[dad] = v; - } - - /* Depth-first search from dummy root vertex #n */ - postnum = 0; -#if 0 - /* recursion */ - etdfs (n, first_kid, next_kid, post, &postnum); -#else - /* no recursion */ - nr_etdfs(n, parent, first_kid, next_kid, post, postnum); -#endif - - SUPERLU_FREE(first_kid); - SUPERLU_FREE(next_kid); - return post; -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/GetDiagU.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/GetDiagU.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/GetDiagU.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/GetDiagU.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,101 +0,0 @@ -/* - * -- Auxiliary routine in distributed SuperLU (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * Xiaoye S. Li - * April 16, 2002 - * - */ - -#include "superlu_ddefs.h" - -void GetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *diagU) -{ - /* - * Purpose - * ======= - * - * GetDiagU extracts the main diagonal of matrix U of the LU factorization. - * - * Arguments - * ========= - * - * n (input) int - * Dimension of the matrix. - * - * LUstruct (input) LUstruct_t* - * The data structures to store the distributed L and U factors. - * see superlu_ddefs.h for its definition. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * - * diagU (output) double*, dimension (n) - * The main diagonal of matrix U. - * On exit, it is available on all processes. - * - * - * Note - * ==== - * - * The diagonal blocks of the L and U matrices are stored in the L - * data structures, and are on the diagonal processes of the - * 2D process grid. - * - * This routine is modified from gather_diag_to_all() in pdgstrs_Bglobal.c. - * - */ - int_t *xsup; - int iam, knsupc, pkk; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ - int_t i, j, jj, k, lk, lwork, nsupers, p; - int_t num_diag_procs, *diag_procs, *diag_len; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double *dblock, *dwork, *lusup; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); - if ( !(dwork = doubleMalloc_dist(jj)) ) ABORT("Malloc fails for dwork[]"); - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy diagonal into buffer dwork[]. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBj( k, grid ); - nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ - lusup = Llu->Lnzval_bc_ptr[lk]; - for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */ - dwork[lwork+i] = lusup[i*(nsupr+1)]; - lwork += knsupc; - } - MPI_Bcast( dwork, lwork, MPI_DOUBLE, pkk, grid->comm ); - } else { - MPI_Bcast( dwork, diag_len[p], MPI_DOUBLE, pkk, grid->comm ); - } - - /* Scatter dwork[] into global diagU vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - dblock = &diagU[FstBlockC( k )]; - for (i = 0; i < knsupc; ++i) dblock[i] = dwork[lwork+i]; - lwork += knsupc; - } - } /* for p = ... */ - - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - SUPERLU_FREE(dwork); -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,474 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley, - * November 1, 2007 - * Feburary 20, 2008 - */ - -#include "superlu_ddefs.h" - - -void -get_metis( - int_t n, /* dimension of matrix B */ - int_t bnz, /* number of nonzeros in matrix A. */ - int_t *b_colptr, /* column pointer of size n+1 for matrix B. */ - int_t *b_rowind, /* row indices of size bnz for matrix B. */ - int_t *perm_c /* out - the column permutation vector. */ - ) -{ -#define METISOPTIONS 8 - int ct, i, j, nm, numflag = 0; /* C-Style ordering */ - int metis_options[METISOPTIONS]; - int_t *perm, *iperm; - - metis_options[0] = 0; /* Use Defaults for now */ - perm = intMalloc_dist(n); - iperm = intMalloc_dist(n); - - nm = n; - -#ifdef USE_METIS - /* Call metis */ -#undef USEEND -#ifdef USEEND - METIS_EdgeND(&nm, b_colptr, b_rowind, &numflag, metis_options, - perm, iperm); -#else - METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options, - perm, iperm); -#endif -#endif - - /* Copy the permutation vector into SuperLU data structure. */ - for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; - - SUPERLU_FREE(b_colptr); - SUPERLU_FREE(b_rowind); - SUPERLU_FREE(perm); - SUPERLU_FREE(iperm); -} - -void -getata_dist( - const int_t m, /* number of rows in matrix A. */ - const int_t n, /* number of columns in matrix A. */ - const int_t nz, /* number of nonzeros in matrix A */ - int_t *colptr, /* column pointer of size n+1 for matrix A. */ - int_t *rowind, /* row indices of size nz for matrix A. */ - int_t *atanz, /* out - on exit, returns the actual number of - nonzeros in matrix A'*A. */ - int_t **ata_colptr, /* out - size n+1 */ - int_t **ata_rowind /* out - size *atanz */ - ) -{ -/* - * Purpose - * ======= - * - * Form the structure of A'*A. A is an m-by-n matrix in column oriented - * format represented by (colptr, rowind). The output A'*A is in column - * oriented format (symmetrically, also row oriented), represented by - * (ata_colptr, ata_rowind). - * - * This routine is modified from GETATA routine by Tim Davis. - * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2, - * i.e., the sum of the square of the row counts. - * - * Questions - * ========= - * o Do I need to withhold the *dense* rows? - * o How do I know the number of nonzeros in A'*A? - * - */ - register int_t i, j, k, col, num_nz, ti, trow; - int_t *marker, *b_colptr, *b_rowind; - int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */ - - if ( !(marker = (int_t*) SUPERLU_MALLOC( (SUPERLU_MAX(m,n)+1) * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for marker[]"); - if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (m+1) * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC t_colptr[]"); - if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for t_rowind[]"); - - - /* Get counts of each column of T, and set up column pointers */ - for (i = 0; i < m; ++i) marker[i] = 0; - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) - ++marker[rowind[i]]; - } - t_colptr[0] = 0; - for (i = 0; i < m; ++i) { - t_colptr[i+1] = t_colptr[i] + marker[i]; - marker[i] = t_colptr[i]; - } - - /* Transpose the matrix from A to T */ - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) { - col = rowind[i]; - t_rowind[marker[col]] = j; - ++marker[col]; - } - - - /* ---------------------------------------------------------------- - compute B = T * A, where column j of B is: - - Struct (B_*j) = UNION ( Struct (T_*k) ) - A_kj != 0 - - do not include the diagonal entry - - ( Partition A as: A = (A_*1, ..., A_*n) - Then B = T * A = (T * A_*1, ..., T * A_*n), where - T * A_*j = (T_*1, ..., T_*m) * A_*j. ) - ---------------------------------------------------------------- */ - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* First pass determines number of nonzeros in B */ - num_nz = 0; - for (j = 0; j < n; ++j) { - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - for (i = colptr[j]; i < colptr[j+1]; ++i) { - /* A_kj is nonzero, add pattern of column T_*k to B_*j */ - k = rowind[i]; - for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { - trow = t_rowind[ti]; - if ( marker[trow] != j ) { - marker[trow] = j; - num_nz++; - } - } - } - } - *atanz = num_nz; - - /* Allocate storage for A'*A */ - if ( !(*ata_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for ata_colptr[]"); - if ( *atanz ) { - if ( !(*ata_rowind = (int_t*)SUPERLU_MALLOC(*atanz*sizeof(int_t)) ) ) { - fprintf(stderr, ".. atanz = %ld\n", *atanz); - ABORT("SUPERLU_MALLOC fails for ata_rowind[]"); - } - } - b_colptr = *ata_colptr; /* aliasing */ - b_rowind = *ata_rowind; - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* Compute each column of B, one at a time */ - num_nz = 0; - for (j = 0; j < n; ++j) { - b_colptr[j] = num_nz; - - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - for (i = colptr[j]; i < colptr[j+1]; ++i) { - /* A_kj is nonzero, add pattern of column T_*k to B_*j */ - k = rowind[i]; - for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { - trow = t_rowind[ti]; - if ( marker[trow] != j ) { - marker[trow] = j; - b_rowind[num_nz++] = trow; - } - } - } - } - b_colptr[n] = num_nz; - - SUPERLU_FREE(marker); - SUPERLU_FREE(t_colptr); - SUPERLU_FREE(t_rowind); -} - - -void -at_plus_a_dist( - const int_t n, /* number of columns in matrix A. */ - const int_t nz, /* number of nonzeros in matrix A */ - int_t *colptr, /* column pointer of size n+1 for matrix A. */ - int_t *rowind, /* row indices of size nz for matrix A. */ - int_t *bnz, /* out - on exit, returns the actual number of - nonzeros in matrix A'+A. */ - int_t **b_colptr, /* out - size n+1 */ - int_t **b_rowind /* out - size *bnz */ - ) -{ -/* - * Purpose - * ======= - * - * Form the structure of A'+A. A is an n-by-n matrix in column oriented - * format represented by (colptr, rowind). The output A'+A is in column - * oriented format (symmetrically, also row oriented), represented by - * (b_colptr, b_rowind). - * - */ - register int_t i, j, k, col, num_nz; - int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */ - int_t *marker; - - if ( !(marker = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for marker[]"); - if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for t_colptr[]"); - if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails t_rowind[]"); - - - /* Get counts of each column of T, and set up column pointers */ - for (i = 0; i < n; ++i) marker[i] = 0; - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) - ++marker[rowind[i]]; - } - t_colptr[0] = 0; - for (i = 0; i < n; ++i) { - t_colptr[i+1] = t_colptr[i] + marker[i]; - marker[i] = t_colptr[i]; - } - - /* Transpose the matrix from A to T */ - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) { - col = rowind[i]; - t_rowind[marker[col]] = j; - ++marker[col]; - } - - - /* ---------------------------------------------------------------- - compute B = A + T, where column j of B is: - - Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) - - do not include the diagonal entry - ---------------------------------------------------------------- */ - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* First pass determines number of nonzeros in B */ - num_nz = 0; - for (j = 0; j < n; ++j) { - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - /* Add pattern of column A_*k to B_*j */ - for (i = colptr[j]; i < colptr[j+1]; ++i) { - k = rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - - /* Add pattern of column T_*k to B_*j */ - for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { - k = t_rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - } - *bnz = num_nz; - - /* Allocate storage for A+A' */ - if ( !(*b_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for b_colptr[]"); - if ( *bnz ) { - if ( !(*b_rowind = (int_t*) SUPERLU_MALLOC( *bnz * sizeof(int_t)) ) ) - ABORT("SUPERLU_MALLOC fails for b_rowind[]"); - } - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* Compute each column of B, one at a time */ - num_nz = 0; - for (j = 0; j < n; ++j) { - (*b_colptr)[j] = num_nz; - - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - /* Add pattern of column A_*k to B_*j */ - for (i = colptr[j]; i < colptr[j+1]; ++i) { - k = rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - (*b_rowind)[num_nz++] = k; - } - } - - /* Add pattern of column T_*k to B_*j */ - for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { - k = t_rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - (*b_rowind)[num_nz++] = k; - } - } - } - (*b_colptr)[n] = num_nz; - - SUPERLU_FREE(marker); - SUPERLU_FREE(t_colptr); - SUPERLU_FREE(t_rowind); -} /* at_plus_a_dist */ - - -void -get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) -/* - * Purpose - * ======= - * - * GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple - * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A', - * or using approximate minimum degree column ordering by Davis et. al. - * The LU factorization of A*Pc tends to have less fill than the LU - * factorization of A. - * - * Arguments - * ========= - * - * ispec (input) colperm_t - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering (i.e., Pc = I) - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A - * = MMD_ATA: minimum degree ordering on structure of A'*A - * = METIS_AT_PLUS_A: MeTis on A'+A - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A - * can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. - * In the future, more general A can be handled. - * - * perm_c (output) int* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - */ -{ - NCformat *Astore = A->Store; - int_t m, n, bnz = 0, *b_colptr, *b_rowind, i; - int_t delta, maxint, nofsub, *invp; - int_t *dhead, *qsize, *llist, *marker; - double t, SuperLU_timer_(); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(pnum, "Enter get_perm_c_dist()"); -#endif - - m = A->nrow; - n = A->ncol; - - t = SuperLU_timer_(); - - switch ( ispec ) { - - case NATURAL: /* Natural ordering */ - for (i = 0; i < n; ++i) perm_c[i] = i; -#if ( PRNTlevel>=1 ) - if ( !pnum ) printf(".. Use natural column ordering\n"); -#endif - return; - - case MMD_AT_PLUS_A: /* Minimum degree ordering on A'+A */ - if ( m != n ) ABORT("Matrix is not square"); - at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); - t = SuperLU_timer_() - t; - /*printf("Form A'+A time = %8.3f\n", t);*/ -#if ( PRNTlevel>=1 ) - if ( !pnum ) printf(".. Use minimum degree ordering on A'+A.\n"); -#endif - break; - - case MMD_ATA: /* Minimum degree ordering on A'*A */ - getata_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); - t = SuperLU_timer_() - t; - /*printf("Form A'*A time = %8.3f\n", t);*/ -#if ( PRNTlevel>=1 ) - if ( !pnum ) printf(".. Use minimum degree ordering on A'*A\n"); -#endif - break; - - case METIS_AT_PLUS_A: /* METIS ordering on A'+A */ - if ( m != n ) ABORT("Matrix is not square"); - at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); - get_metis(n, bnz, b_colptr, b_rowind, perm_c); -#if ( PRNTlevel>=1 ) - if ( !pnum ) printf(".. Use METIS ordering on A'+A\n"); -#endif - return; - - default: - ABORT("Invalid ISPEC"); - } - - if ( bnz ) { - t = SuperLU_timer_(); - - /* Initialize and allocate storage for GENMMD. */ - delta = 0; /* DELTA is a parameter to allow the choice of nodes - whose degree <= min-degree + DELTA. */ - maxint = 2147483647; /* 2**31 - 1 */ - invp = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); - if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp."); - dhead = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); - if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead."); - qsize = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); - if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize."); - llist = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); - if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist."); - marker = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); - if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker."); - - /* Transform adjacency list into 1-based indexing required by GENMMD.*/ - for (i = 0; i <= n; ++i) ++b_colptr[i]; - for (i = 0; i < bnz; ++i) ++b_rowind[i]; - - genmmd_dist_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, - qsize, llist, marker, &maxint, &nofsub); - - /* Transform perm_c into 0-based indexing. */ - for (i = 0; i < n; ++i) --perm_c[i]; - - SUPERLU_FREE(invp); - SUPERLU_FREE(dhead); - SUPERLU_FREE(qsize); - SUPERLU_FREE(llist); - SUPERLU_FREE(marker); - SUPERLU_FREE(b_rowind); - - t = SuperLU_timer_() - t; - /* printf("call GENMMD time = %8.3f\n", t);*/ - - } else { /* Empty adjacency structure */ - for (i = 0; i < n; ++i) perm_c[i] = i; - } - - SUPERLU_FREE(b_colptr); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(pnum, "Exit get_perm_c_dist()"); -#endif -} /* get_perm_c_dist */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,898 +0,0 @@ - -/* - * -- Distributed symbolic factorization auxialiary routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 - * INRIA France - January 2004 - * Laura Grigori - * - * November 1, 2007 - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include -#include -#include "superlu_ddefs.h" - -/* - * Internal protypes - */ - -static float -a_plus_at_CompRow_loc -(int, int_t *, int, int_t *, int_t , int_t *, int_t *, - int, int_t *, int_t *, int_t **, int_t **, gridinfo_t *); - -float -get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c, - int nprocs_i, int noDomains, - int_t **sizes, int_t **fstVtxSep, - gridinfo_t *grid, MPI_Comm *metis_comm) -/* - * Purpose - * ======= - * - * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a - * graph partitioning algorithm to the symmetrized graph A+A'. The - * multilevel graph partitioning algorithm used is the - * ParMETIS_V3_NodeND routine available in the parallel graph - * partitioning package parMETIS. - * - * The number of independent sub-domains noDomains computed by this - * algorithm has to be a power of 2. Hence noDomains is the larger - * number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow - * * npcol is the number of processors used in SuperLU_DIST. - * - * Arguments - * ========= - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Matrix A is distributed - * in NRformat_loc format. - * - * perm_r (input) int_t* - * Row permutation vector of size A->nrow, which defines the - * permutation matrix Pr; perm_r[i] = j means row i of A is in - * position j in Pr*A. - * - * perm_c (output) int_t* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * nprocs_i (input) int* - * Number of processors the input matrix is distributed on in a block - * row format. It corresponds to number of processors used in - * SuperLU_DIST. - * - * noDomains (input) int*, must be power of 2 - * Number of independent domains to be computed by the graph - * partitioning algorithm. ( noDomains <= nprocs_i ) - * - * sizes (output) int_t**, of size 2 * noDomains - * Returns pointer to an array containing the number of nodes - * for each sub-domain and each separator. Separators are stored - * from left to right. - * Memory for the array is allocated in this routine. - * - * fstVtxSep (output) int_t**, of size 2 * noDomains - * Returns pointer to an array containing first node for each - * sub-domain and each separator. - * Memory for the array is allocated in this routine. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the symbolic factorization. - * > 0, number of bytes allocated when out of memory. - * - */ -{ - NRformat_loc *Astore; - int iam, p; - int *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int; - int *options, numflag; - int_t m_loc, nnz_loc, fst_row; - int_t m, n, bnz, i, j; - int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes; - int_t *b_rowptr, *b_colind; - int_t *dist_order; - int *recvcnts, *displs; - /* first row index on each processor when the matrix is distributed - on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */ - int_t *vtxdist_i, *vtxdist_o; - int_t szSep, k, noNodes; - float apat_mem_l; /* memory used during the computation of the graph of A+A' */ - float mem; /* Memory used during this routine */ - MPI_Status status; - - /* Initialization. */ - MPI_Comm_rank (grid->comm, &iam); - n = A->ncol; - m = A->nrow; - if ( m != n ) ABORT("Matrix is not square"); - mem = 0.; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()"); -#endif - - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */ - m_loc = Astore->m_loc; /* number of rows local to this processor */ - fst_row = Astore->fst_row; /* global index of the first row */ - rowptr = Astore->rowptr; /* pointer to rows and column indices */ - colind = Astore->colind; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n", - noDomains); -#endif - - numflag = 0; - /* determine first row on each processor */ - vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); - if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i."); - vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); - if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o."); - - MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t, - grid->comm); - vtxdist_i[nprocs_i] = m; - - if (noDomains == nprocs_i) { - /* keep the same distribution of A */ - for (p = 0; p <= nprocs_i; p++) - vtxdist_o[p] = vtxdist_i[p]; - } - else { - i = n / noDomains; - j = n % noDomains; - for (k = 0, p = 0; p < noDomains; p++) { - vtxdist_o[p] = k; - k += i; - if (p < j) k++; - } - /* The remaining non-participating processors get the same - first-row-number as the last processor. */ - for (p = noDomains; p <= nprocs_i; p++) - vtxdist_o[p] = k; - } - -#if ( DEBUGlevel>=2 ) - if (!iam) - PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o); -#endif - - /* Compute distributed A + A' */ - if ((apat_mem_l = - a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i, - n, rowptr, colind, noDomains, vtxdist_o, - &bnz, &b_rowptr, &b_colind, grid)) > 0) - return (apat_mem_l); - mem += -apat_mem_l; - - /* Initialize and allocate storage for parMetis. */ - (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); - if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes."); - l_sizes = *sizes; - (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); - if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep."); - l_fstVtxSep = *fstVtxSep; - m_loc = vtxdist_o[iam+1] - vtxdist_o[iam]; - - if ( iam < noDomains) - /* dist_order_int is the perm returned by parMetis, distributed */ - if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int)))) - ABORT("SUPERLU_MALLOC fails for dist_order_int."); - - /* ParMETIS represents the column pointers and row indices of * - * the input matrix using integers. When SuperLU_DIST uses * - * long int for the int_t type, then several supplementary * - * copies need to be performed in order to call ParMETIS. */ -#if defined (_LONGINT) - l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int)); - if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int."); - - /* Allocate storage */ - if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int)))) - ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]"); - for (i = 0; i <= m_loc; i++) - b_rowptr_int[i] = b_rowptr[i]; - SUPERLU_FREE (b_rowptr); - - if ( bnz ) { - if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int)))) - ABORT("SUPERLU_MALLOC fails for b_colind_int[]"); - for (i = 0; i < bnz; i++) - b_colind_int[i] = b_colind[i]; - SUPERLU_FREE (b_colind); - } - - if ( !(vtxdist_o_int = - (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int)))) - ABORT("SUPERLU_MALLOC fails for vtxdist_o_int."); - for (i = 0; i <= nprocs_i; i++) - vtxdist_o_int[i] = vtxdist_o[i]; - SUPERLU_FREE (vtxdist_o); - -#else /* Default */ - - vtxdist_o_int = vtxdist_o; - b_rowptr_int = b_rowptr; b_colind_int = b_colind; - l_sizes_int = l_sizes; - -#endif - - if ( iam < noDomains) { - options = (int *) SUPERLU_MALLOC(4 * sizeof(int)); - options[0] = 0; - options[1] = 0; - options[2] = 0; - options[3] = 1; - -#if 0 - ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int, - &numflag, options, - dist_order_int, l_sizes_int, metis_comm); -#endif - } - - if (bnz) - SUPERLU_FREE (b_colind_int); - if ( iam < noDomains) { - SUPERLU_FREE (options); - } - SUPERLU_FREE (b_rowptr_int); - -#if defined (_LONGINT) - /* Copy data from dist_order_int to dist_order */ - if ( iam < noDomains) { - /* dist_order is the perm returned by parMetis, distributed */ - if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for dist_order."); - for (i = 0; i < m_loc; i++) - dist_order[i] = dist_order_int[i]; - SUPERLU_FREE(dist_order_int); - - for (i = 0; i < 2*noDomains; i++) - l_sizes[i] = l_sizes_int[i]; - SUPERLU_FREE(l_sizes_int); - } -#else - dist_order = dist_order_int; -#endif - - /* Allgatherv dist_order to get perm_c */ - if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) - ABORT ("SUPERLU_MALLOC fails for displs."); - if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) - ABORT ("SUPERLU_MALLOC fails for recvcnts."); - for (i = 0; i < nprocs_i; i++) - recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i]; - displs[0]=0; - for(i=1; i < nprocs_i; i++) - displs[i] = displs[i-1] + recvcnts[i-1]; - - MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, - mpi_int_t, grid->comm); - - if ( iam < noDomains) { - SUPERLU_FREE (dist_order); - } - SUPERLU_FREE (vtxdist_i); - SUPERLU_FREE (vtxdist_o_int); - SUPERLU_FREE (recvcnts); - SUPERLU_FREE (displs); - - /* send l_sizes to every processor p >= noDomains */ - if (!iam) - for (p = noDomains; p < nprocs_i; p++) - MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm); - if (noDomains <= iam && iam < nprocs_i) - MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm, - &status); - - /* Determine the first node in each separator, store it in l_fstVtxSep */ - for (j = 0; j < 2 * noDomains; j++) - l_fstVtxSep[j] = 0; - l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2]; - szSep = noDomains; - i = 0; - while (szSep != 1) { - for (j = i; j < i + szSep; j++) { - l_fstVtxSep[j] += l_sizes[j]; - } - for (j = i; j < i + szSep; j++) { - k = i + szSep + (j-i) / 2; - l_fstVtxSep[k] += l_fstVtxSep[j]; - } - i += szSep; - szSep = szSep / 2; - } - - l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2]; - i = 2 * noDomains - 2; - szSep = 1; - while (i > 0) { - for (j = i; j < i + szSep; j++) { - k = (i - 2 * szSep) + (j-i) * 2 + 1; - noNodes = l_fstVtxSep[k]; - l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k]; - l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - - noNodes - l_sizes[k-1]; - } - szSep *= 2; - i -= szSep; - } - -#if ( PRNTlevel>=2 ) - if (!iam ) { - PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes); - PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep); - } -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()"); -#endif - - return (-mem); - -} /* get_perm_c_parmetis */ - - -static float -a_plus_at_CompRow_loc -( - int iam, /* Input - my processor number */ - int_t *perm_r, /* Input - row permutation vector Pr */ - int nprocs_i, /* Input - number of processors the input matrix - is distributed on */ - int_t *vtxdist_i, /* Input - index of first row on each processor of the input matrix */ - int_t n, /* Input - number of columns in matrix A. */ - int_t *rowptr, /* Input - row pointers of size m_loc+1 for matrix A. */ - int_t *colind, /* Input - column indices of size nnz_loc for matrix A. */ - int nprocs_o, /* Input - number of processors the output matrix - is distributed on */ - int_t *vtxdist_o, /* Input - index of first row on each processor of the output matrix */ - int_t *p_bnz, /* Output - on exit, returns the actual number of - local nonzeros in matrix A'+A. */ - int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */ - int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */ - gridinfo_t *grid /* Input - grid of processors information */ - ) -{ -/* - * Purpose - * ======= - * - * Form the structure of Pr*A +A'Pr'. A is an n-by-n matrix in - * NRformat_loc format, represented by (rowptr, colind). The output - * B=Pr*A +A'Pr' is in NRformat_loc format (symmetrically, also row - * oriented), represented by (b_rowptr, b_colind). - * - * The input matrix A is distributed in block row format on nprocs_i - * processors. The output matrix B is distributed in block row format - * on nprocs_o processors, where nprocs_o <= nprocs_i. On output, the - * matrix B has its rows permuted according to perm_r. - * - * Sketch of the algorithm - * ======================= - * - * Let iam by my process number. Let fst_row, lst_row = m_loc + - * fst_row be the first/last row stored on iam. - * - * Compute Pr' - the inverse row permutation, stored in iperm_r. - * - * Compute the transpose of the block row of Pr*A that iam owns: - * T[:,Pr(fst_row:lst_row)] = Pr' * A[:,fst_row:lst_row] * Pr' - * - * - * All to all communication such that every processor iam receives all - * the blocks of the transpose matrix that it needs, that is - * T[fst_row:lst_row, :] - * - * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :] - * - * If Pr != I or nprocs_i != nprocs_o then permute the rows of B (that - * is compute Pr*B) and redistribute from nprocs_i to nprocs_o - * according to the block row distribution in vtxdist_i, vtxdist_o. - */ - - int_t i, j, k, col, num_nz, nprocs; - int_t *tcolind_recv; /* temporary receive buffer */ - int_t *tcolind_send; /* temporary send buffer */ - int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv; - int_t ind, ind_tmp, ind_rcv; - int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */ - int_t *marker, *iperm_r; - int_t *sendCnts, *recvCnts; - int_t *sdispls, *rdispls; - int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t; - int_t p, t_ind, nelts, ipcol; - int_t m_loc, m_loc_o; /* number of local rows */ - int_t fst_row, fst_row_o; /* index of first local row */ - int_t nnz_loc; /* number of local nonzeros in matrix A */ - float apat_mem, apat_mem_max; - int *intBuf1, *intBuf2, *intBuf3, *intBuf4; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()"); -#endif - - fst_row = vtxdist_i[iam]; - m_loc = vtxdist_i[iam+1] - vtxdist_i[iam]; - nnz_loc = rowptr[m_loc]; - redist_pra = FALSE; - nprocs = SUPERLU_MAX(nprocs_i, nprocs_o); - apat_mem_max = 0.; - - if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for marker[]"); - if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for iperm_r[]"); - if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for sendCnts[]"); - if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for recvCnts[]"); - if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for sdispls[]"); - if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for rdispls[]"); - apat_mem = 2 * n + 4 * nprocs + 3; - -#if defined (_LONGINT) - intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int)); - intBuf2 = intBuf1 + nprocs; - intBuf3 = intBuf1 + 2 * nprocs; - intBuf4 = intBuf1 + 3 * nprocs; - apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t); -#endif - - /* compute the inverse row permutation vector */ - for (i = 0; i < n; i++) { - marker[i] = 1; - if (perm_r[i] != i) - redist_pra = TRUE; - iperm_r[perm_r[i]] = i; - } - - /* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam. */ - /* THE RESULT IS STORED IN TCOLIND_SEND. */ - /* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX. */ - - /* First pass to get counts of each row of T, and set up column pointers */ - for (j = 0; j < m_loc; j++) { - for (i = rowptr[j]; i < rowptr[j+1]; i++){ - marker[iperm_r[colind[i]]]++; - } - } - /* determine number of elements to be sent to each processor */ - for (p = 0; p < nprocs_i; p++) { - sendCnts[p] = 0; - for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) - sendCnts[p] += marker[i]; - } - /* exchange send/receive counts information in between all processors */ - MPI_Alltoall (sendCnts, 1, mpi_int_t, - recvCnts, 1, mpi_int_t, grid->comm); - sendCnts[iam] = 0; - sz_tcolind_loc = recvCnts[iam]; - - for (i = 0, j = 0, p = 0; p < nprocs_i; p++) { - rdispls[p] = j; - j += recvCnts[p]; - sdispls[p] = i; - i += sendCnts[p]; - } - recvCnts[iam] = 0; - sz_tcolind_recv = j; - sz_tcolind_send = i; - - /* allocate memory to receive necessary blocks of transpose matrix T */ - if (sz_tcolind_recv) { - if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv - * sizeof(int_t) ))) - ABORT("SUPERLU_MALLOC fails tcolind_recv[]"); - apat_mem += sz_tcolind_recv; - } - /* allocate memory to send blocks of local transpose matrix T to other processors */ - if (sz_tcolind_send) { - if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) - * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for tcolind_send[]"); - apat_mem += sz_tcolind_send; - } - - /* Set up marker[] to point at the beginning of each row in the - send/receive buffer. For each row, we store first its number of - elements, and then the elements. */ - ind_rcv = rdispls[iam]; - for (p = 0; p < nprocs_i; p++) { - for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) { - nelts = marker[i] - 1; - if (p == iam) { - tcolind_recv[ind_rcv] = nelts; - marker[i] = ind_rcv + 1; - ind_rcv += nelts + 1; - } - else { - tcolind_send[sdispls[p]] = nelts; - marker[i] = sdispls[p] + 1; - sdispls[p] += nelts + 1; - } - } - } - /* reset sdispls vector */ - for (i = 0, p = 0; p < nprocs_i; p++) { - sdispls[p] = i; - i += sendCnts[p]; - } - /* Second pass of the local matrix A to copy data to be send */ - for (j = 0; j < m_loc; j++) - for (i = rowptr[j]; i < rowptr[j+1]; i++) { - col = colind[i]; - ipcol = iperm_r[col]; - if (ipcol >= fst_row && ipcol < fst_row + m_loc) /* local data */ - tcolind_recv[marker[ipcol]] = perm_r[j + fst_row]; - else /* remote */ - tcolind_send[marker[ipcol]] = perm_r[j + fst_row]; - marker[ipcol] ++; - } - sendCnts[iam] = 0; - recvCnts[iam] = 0; - -#if defined (_LONGINT) - for (p=0; p INT_MAX || sdispls[p] > INT_MAX || - recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) sendCnts[p]; - intBuf2[p] = (int) sdispls[p]; - intBuf3[p] = (int) recvCnts[p]; - intBuf4[p] = (int) rdispls[p]; - } -#else /* Default */ - intBuf1 = sendCnts; intBuf2 = sdispls; - intBuf3 = recvCnts; intBuf4 = rdispls; -#endif - - /* send/receive transpose matrix T */ - MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t, - tcolind_recv, intBuf3, intBuf4, mpi_int_t, - grid->comm); - /* ------------------------------------------------------------ - DEALLOCATE SEND COMMUNICATION STORAGE - ------------------------------------------------------------*/ - if (sz_tcolind_send) { - SUPERLU_FREE( tcolind_send ); - apat_mem_max = apat_mem; - apat_mem -= sz_tcolind_send; - } - - /* ---------------------------------------------------------------- - FOR LOCAL ROWS: - compute B = A + T, where row j of B is: - Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:)) - do not include the diagonal entry - THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T. - ------------------------------------------------------------------ */ - - /* Reset marker to EMPTY */ - for (i = 0; i < n; ++i) marker[i] = EMPTY; - /* save rdispls information */ - for (p = 0; p < nprocs_i; p++) - sdispls[p] = rdispls[p]; - - /* First pass determines number of nonzeros in B */ - num_nz = 0; - for (j = 0; j < m_loc; j++) { - /* Flag the diagonal so it's not included in the B matrix */ - marker[perm_r[j + fst_row]] = j; - - /* Add pattern of row A(j,:) to B(j,:) */ - for (i = rowptr[j]; i < rowptr[j+1]; i++) { - k = colind[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - - /* Add pattern of row T(j,:) to B(j,:) */ - for (p = 0; p < nprocs_i; p++) { - t_ind = rdispls[p]; - nelts = tcolind_recv[t_ind]; t_ind ++; - for (i = t_ind; i < t_ind + nelts; i++) { - k = tcolind_recv[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - t_ind += nelts; - rdispls[p] = t_ind; - } - } - bnz_t = num_nz; - - /* Allocate storage for B=Pr*A+A'*Pr' */ - if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]"); - if ( bnz_t ) { - if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for b_colind_t[]"); - } - apat_mem += m_loc + 1 + bnz_t; - if (apat_mem > apat_mem_max) - apat_mem_max = apat_mem; - - /* Reset marker to EMPTY */ - for (i = 0; i < n; i++) marker[i] = EMPTY; - /* restore rdispls information */ - for (p = 0; p < nprocs_i; p++) - rdispls[p] = sdispls[p]; - - /* Second pass, compute each row of B, one at a time */ - num_nz = 0; - t_ind = 0; - for (j = 0; j < m_loc; j++) { - b_rowptr_t[j] = num_nz; - - /* Flag the diagonal so it's not included in the B matrix */ - marker[perm_r[j + fst_row]] = j; - - /* Add pattern of row A(j,:) to B(j,:) */ - for (i = rowptr[j]; i < rowptr[j+1]; i++) { - k = colind[i]; - if ( marker[k] != j ) { - marker[k] = j; - b_colind_t[num_nz] = k; num_nz ++; - } - } - - /* Add pattern of row T(j,:) to B(j,:) */ - for (p = 0; p < nprocs_i; p++) { - t_ind = rdispls[p]; - nelts = tcolind_recv[t_ind]; t_ind++; - for (i = t_ind; i < t_ind + nelts; i++) { - k = tcolind_recv[i]; - if ( marker[k] != j ) { - marker[k] = j; - b_colind_t[num_nz] = k; num_nz++; - } - } - t_ind += nelts; - rdispls[p] = t_ind; - } - } - b_rowptr_t[m_loc] = num_nz; - - for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++) - if (vtxdist_i[p] != vtxdist_o[p]) - redist_pra = TRUE; - - if (sz_tcolind_recv) { - SUPERLU_FREE (tcolind_recv); - apat_mem -= sz_tcolind_recv; - } - SUPERLU_FREE (marker); - SUPERLU_FREE (iperm_r); - apat_mem -= 2 * n + 1; - - /* redistribute permuted matrix (by rows) from nproc_i processors - to nproc_o processors */ - if (redist_pra) { - m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam]; - fst_row_o = vtxdist_o[iam]; - nnz_loc = 0; - - if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) ) - ABORT("Malloc fails for *b_rowptr[]."); - apat_mem += m_loc_o + 1; - if (apat_mem > apat_mem_max) - apat_mem_max = apat_mem; - - for (p = 0; p < nprocs_i; p++) { - sendCnts[p] = 0; - recvCnts[p] = 0; - } - - for (i = 0; i < m_loc; i++) { - k = perm_r[i+fst_row]; - /* find the processor to which row k belongs */ - j = FALSE; p = 0; - while (!j) { - if (vtxdist_o[p] <= k && k < vtxdist_o[p+1]) - j = TRUE; - else - p ++; - } - if (p == iam) { - b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i]; - nnz_loc += b_rowptr[k-fst_row_o]; - } - else - sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2; - } - /* exchange send/receive counts information in between all processors */ - MPI_Alltoall (sendCnts, 1, mpi_int_t, - recvCnts, 1, mpi_int_t, grid->comm); - - for (i = 0, j = 0, p = 0; p < nprocs_i; p++) { - rdispls[p] = j; - j += recvCnts[p]; - sdispls[p] = i; - i += sendCnts[p]; - } - rdispls[p] = j; - sdispls[p] = i; - sz_tcolind_recv = j; - sz_tcolind_send = i; - - /* allocate memory for local data */ - tcolind_recv = NULL; - tcolind_send = NULL; - if (sz_tcolind_recv) { - if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv - * sizeof(int_t) ))) - ABORT("SUPERLU_MALLOC fails tcolind_recv[]"); - apat_mem += sz_tcolind_recv; - } - /* allocate memory to receive necessary data */ - if (sz_tcolind_send) { - if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) - * sizeof(int_t)))) - ABORT("SUPERLU_MALLOC fails for tcolind_send[]"); - apat_mem += sz_tcolind_send; - } - if (apat_mem > apat_mem_max) - apat_mem_max = apat_mem; - - /* Copy data to be send */ - ind_rcv = rdispls[iam]; - for (i = 0; i < m_loc; i++) { - k = perm_r[i+fst_row]; - /* find the processor to which row k belongs */ - j = FALSE; p = 0; - while (!j) { - if (vtxdist_o[p] <= k && k < vtxdist_o[p+1]) - j = TRUE; - else - p ++; - } - if (p != iam) { /* remote */ - tcolind_send[sdispls[p]] = k; - tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i]; - sdispls[p] += 2; - for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) { - tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++; - } - } - } - - /* reset sdispls vector */ - for (i = 0, p = 0; p < nprocs_i; p++) { - sdispls[p] = i; - i += sendCnts[p]; - } - sendCnts[iam] = 0; - recvCnts[iam] = 0; - -#if defined (_LONGINT) - for (p=0; p INT_MAX || sdispls[p] > INT_MAX || - recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) sendCnts[p]; - intBuf2[p] = (int) sdispls[p]; - intBuf3[p] = (int) recvCnts[p]; - intBuf4[p] = (int) rdispls[p]; - } -#else /* Default */ - intBuf1 = sendCnts; intBuf2 = sdispls; - intBuf3 = recvCnts; intBuf4 = rdispls; -#endif - - /* send/receive permuted matrix T by rows */ - MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t, - tcolind_recv, intBuf3, intBuf4, mpi_int_t, - grid->comm); - /* ------------------------------------------------------------ - DEALLOCATE COMMUNICATION STORAGE - ------------------------------------------------------------*/ - if (sz_tcolind_send) { - SUPERLU_FREE( tcolind_send ); - apat_mem -= sz_tcolind_send; - } - - /* ------------------------------------------------------------ - STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER - ------------------------------------------------------------*/ - for (p = 0; p < nprocs; p++) { - if (p != iam) { - i = rdispls[p]; - while (i < rdispls[p+1]) { - j = tcolind_recv[i]; - nelts = tcolind_recv[i+1]; - i += 2 + nelts; - b_rowptr[j-fst_row_o] = nelts; - nnz_loc += nelts; - } - } - } - - if (nnz_loc) - if ( !(b_colind = intMalloc_dist(nnz_loc)) ) { - ABORT("Malloc fails for bcolind[]."); - apat_mem += nnz_loc; - if (apat_mem > apat_mem_max) - apat_mem_max = apat_mem; - } - - /* Initialize the array of row pointers */ - k = 0; - for (j = 0; j < m_loc_o; j++) { - i = b_rowptr[j]; - b_rowptr[j] = k; - k += i; - } - if (m_loc_o) b_rowptr[j] = k; - - /* Copy the data into the row oriented storage */ - for (p = 0; p < nprocs; p++) { - if (p != iam) { - i = rdispls[p]; - while (i < rdispls[p+1]) { - j = tcolind_recv[i]; - nelts = tcolind_recv[i+1]; - for (i += 2, k = b_rowptr[j-fst_row_o]; - k < b_rowptr[j-fst_row_o+1]; i++, k++) - b_colind[k] = tcolind_recv[i]; - } - } - } - for (i = 0; i < m_loc; i++) { - k = perm_r[i+fst_row]; - if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) { - ind = b_rowptr[k-fst_row_o]; - for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++) - b_colind[ind] = b_colind_t[j]; - } - } - - SUPERLU_FREE(b_rowptr_t); - if ( bnz_t ) - SUPERLU_FREE(b_colind_t); - if (sz_tcolind_recv) - SUPERLU_FREE(tcolind_recv); - apat_mem -= bnz_t + m_loc + sz_tcolind_recv; - - *p_bnz = nnz_loc; - *p_b_rowptr = b_rowptr; - *p_b_colind = b_colind; - } - else { - *p_bnz = bnz_t; - *p_b_rowptr = b_rowptr_t; - *p_b_colind = b_colind_t; - } - - SUPERLU_FREE (rdispls); - SUPERLU_FREE (sdispls); - SUPERLU_FREE (sendCnts); - SUPERLU_FREE (recvCnts); - apat_mem -= 4 * nprocs + 2; -#if defined (_LONGINT) - SUPERLU_FREE (intBuf1); - apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()"); -#endif - - return (- apat_mem_max * sizeof(int_t)); -} /* a_plus_at_CompRow_loc */ - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/hypre_interface.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/hypre_interface.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/hypre_interface.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/hypre_interface.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -#include "fortran.h" -double hypre_F90_NAME(hy_dlamch,HY_DLAMCH)(char* in) -{ - return hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(in); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/lsame.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/lsame.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/lsame.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/lsame.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -#include "Cnames.h" - -int lsame_(char *ca, char *cb) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - Purpose - ======= - - LSAME returns .TRUE. if CA is the same letter as CB regardless of case. - - Arguments - ========= - - CA (input) CHARACTER*1 - CB (input) CHARACTER*1 - CA and CB specify the single characters to be compared. - - ===================================================================== -*/ - - /* System generated locals */ - int ret_val; - - /* Local variables */ - int inta, intb, zcode; - - ret_val = *(unsigned char *)ca == *(unsigned char *)cb; - if (ret_val) { - return ret_val; - } - - /* Now test for equivalence if both characters are alphabetic. */ - - zcode = 'Z'; - - /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime - machines, on which ICHAR returns a value with bit 8 set. - ICHAR('A') on Prime machines returns 193 which is the same as - ICHAR('A') on an EBCDIC machine. */ - - inta = *(unsigned char *)ca; - intb = *(unsigned char *)cb; - - if (zcode == 90 || zcode == 122) { - /* ASCII is assumed - ZCODE is the ASCII code of either lower or - upper case 'Z'. */ - if (inta >= 97 && inta <= 122) inta += -32; - if (intb >= 97 && intb <= 122) intb += -32; - - } else if (zcode == 233 || zcode == 169) { - /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or - upper case 'Z'. */ - if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta - >= 162 && inta <= 169) - inta += 64; - if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb - >= 162 && intb <= 169) - intb += 64; - } else if (zcode == 218 || zcode == 250) { - /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code - plus 128 of either lower or upper case 'Z'. */ - if (inta >= 225 && inta <= 250) inta += -32; - if (intb >= 225 && intb <= 250) intb += -32; - } - ret_val = inta == intb; - return ret_val; - -} /* lsame_ */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/machines.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/machines.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/machines.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/machines.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -/* - * -- SuperLU MT routine (version 1.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * August 15, 1997 - * - * These macros define which machine will be used. - */ - -#ifndef __SUPERLU_MACHINES /* allow multiple inclusions */ -#define __SUPERLU_MACHINES - -#define SGI 0 -#define ORIGIN 1 -#define DEC 2 -#define CRAY_T3E 3 -#define SUN 4 -#define PTHREAD 5 -#define IBM 6 - -#ifdef _SGI -#define MACH SGI -#endif - -#ifdef _ORIGIN -#define MACH ORIGIN -#endif - -#ifdef _DEC -#define MACH DEC -#endif - -#ifdef _CRAY -#define MACH CRAY_T3E -#endif - -#ifdef _SOLARIS -#define MACH SUN -#endif - -#ifdef _PTHREAD -#define MACH PTHREAD -#endif - -#if ( defined(_SP2) || defined(_SP) ) -#define MACH IBM -#endif - -#endif /* __SUPERLU_MACHINES */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,191 +0,0 @@ -####################################################################### -# -# This makefile creates a library for distributed SuperLU. -# The files are organized as follows: -# -# ALLAUX -- Auxiliary routines called from all precisions -# DSLUSRC -- Double precision real serial SuperLU routines -# DPLUSRC -- Double precision real parallel SuperLU routines -# ZSLUSRC -- Double precision complex serial SuperLU routines -# ZPLUSRC -- Double precision complex parallel SuperLU routines -# -# The library can be set up to include routines for any combination -# of the two precisions. To create or add to the library, enter make -# followed by one or more of the precisions desired. Some examples: -# make double -# make double complex16 -# Alternatively, the command -# make -# without any arguments creates a library of all two precisions. -# The library is called -# superlu.a -# and is created at the next higher directory level. -# -# To remove the object files after the library is created, enter -# make clean -# -####################################################################### -#include ../make.inc -# -include ../../../config/Makefile.config -CINCLUDES = ${INCLUDES} ${MPIINCLUDE} -C_COMPILE_FLAGS = \ - -I$(srcdir)\ - -I$(srcdir)/../../../\ - -I$(srcdir)/../../../utilities\ - ${CINCLUDES} - -# Precision independent routines -# -ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ - mmd.o comm.o memory.o util.o superlu_grid.o \ - pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o mc21.o symbfact.o \ - psymbfact.o psymbfact_util.o \ - get_perm_c_parmetis.o - -#### LAPACK auxiliary routines -LAAUX = lsame.o xerbla.o slamch.o dlamch.o - -# -# Routines to permute large entries to diagonal -# -# DLDPERM = dldperm.o mc64ad.o mc21.o #fd05.o - -# -# Routines literally taken from SuperLU -# -DSLUSRC = dlangs.o dgsequ.o dlaqgs.o dutil.o \ - dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o -ZSLUSRC = dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \ - zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o - -# -# Routines for double precision parallel SuperLU -DPLUSRC = dldperm.o ddistribute.o pdgstrf_irecv.o pdgstrs_Bglobal.o \ - pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\ - pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \ - pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \ - pdsymbfact_distdata.o - -# -# Routines for double complex parallel SuperLU -ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o \ - pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ - pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \ - pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \ - pzsymbfact_distdata.o - -####################################### -### changed for hypre -####################################### - -FILES =\ - dldperm.c \ - ddistribute.c \ - pdgstrf_irecv.c \ - pdgstrs_Bglobal.c \ - pdgstrs1.c \ - pdgssvx_ABglobal.c \ - pdgsrfs_ABXglobal.c \ - pdgsmv_AXglobal.c \ - pdgssvx.c \ - pdgstrs.c \ - pddistribute.c \ - pdlangs.c \ - pdutil.c \ - pdgsequ.c \ - pdlaqgs.c \ - pdgsrfs.c \ - pdgsmv.c \ - pdgstrs_lsum.c \ - pdsymbfact_distdata.c \ - get_perm_c_parmetis.o \ - sp_ienv.c \ - etree.c \ - sp_colorder.c \ - get_perm_c.c \ - mmd.c \ - comm.c \ - memory.c \ - util.c \ - superlu_grid.c \ - pxerbla.c \ - superlu_timer.c \ - GetDiagU.c \ - symbfact.c \ - psymbfact.c \ - psymbfact_util.c \ - lsame.c \ - xerbla.c \ - dlangs.c \ - dgsequ.c \ - dlaqgs.c \ - dutil.c \ - dmemory.c \ - slamch.c \ - dmyblas2.c \ - dsp_blas2.c \ - dsp_blas3.c \ - hypre_interface.c - -FILES2 =\ - mc64ad.f \ - mc21.f - -OBJS = ${FILES:.c=.o} ${FILES2:.f=.o} - -all: libHYPRE_dsuperlu${HYPRE_LIB_SUFFIX} - cp -fp *.h $(HYPRE_BUILD_DIR)/include -# cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib - -install: libHYPRE_dsuperlu${HYPRE_LIB_SUFFIX} - cp -f *.h $(HYPRE_INC_INSTALL) -# cp -f libHYPRE* $(HYPRE_LIB_INSTALL) - @echo " " - -clean: - rm -f *.o libHYPRE* - rm -rf pchdir tca.map *inslog* -distclean: clean - -libHYPRE_dsuperlu.a: ${OBJS} - @echo "Building $@ ... " - ${AR} $@ ${OBJS} - ${RANLIB} $@ - -libHYPRE_dsuperlu.so: ${OBJS} - @echo "Building $@ ... " - ${BUILD_CC_SHARED} -o $@ ${OBJS} - -####################################### - -#all: double complex16 - -double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) - $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ - $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) - $(RANLIB) $(DSUPERLULIB) - -complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) - $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ - $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) - $(RANLIB) $(DSUPERLULIB) - - -################################## -# Do not optimize these routines # -################################## -slamch.o: slamch.c ; $(CC) $(CFLAGS) $(NOOPTS) $(CDEFS) -c $< -dlamch.o: dlamch.c ; $(CC) $(CFLAGS) $(NOOPTS) $(CDEFS) -c $< -################################## - -#.c.o: -# $(CC) $(CFLAGS) $(CDEFS) -DUSE_VENDOR_BLAS -c $< $(VERBOSE) - -#.f.o: -# f77 $(FFLAGS) -c $< $(VERBOSE) - -#clean: -# rm -f *.o $(DSUPERLULIB) - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu 1970-01-01 00:00:00.000000000 +0000 @@ -1,100 +0,0 @@ -####################################################################### -# -# This makefile creates a library for distributed SuperLU. -# The files are organized as follows: -# -# ALLAUX -- Auxiliary routines called from all precisions -# DSLUSRC -- Double precision real serial SuperLU routines -# DPLUSRC -- Double precision real parallel SuperLU routines -# ZSLUSRC -- Double precision complex serial SuperLU routines -# ZPLUSRC -- Double precision complex parallel SuperLU routines -# -# The library can be set up to include routines for any combination -# of the two precisions. To create or add to the library, enter make -# followed by one or more of the precisions desired. Some examples: -# make double -# make double complex16 -# Alternatively, the command -# make -# without any arguments creates a library of all two precisions. -# The library is called -# superlu.a -# and is created at the next higher directory level. -# -# To remove the object files after the library is created, enter -# make clean -# -####################################################################### -include ../make.inc -# -# Precision independent routines -# -ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ - mmd.o comm.o memory.o util.o superlu_grid.o \ - pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o mc21.o symbfact.o \ - psymbfact.o psymbfact_util.o \ - get_perm_c_parmetis.o - -#### LAPACK auxiliary routines -LAAUX = lsame.o xerbla.o slamch.o dlamch.o - -# -# Routines to permute large entries to diagonal -# -# DLDPERM = dldperm.o mc64ad.o mc21.o #fd05.o - -# -# Routines literally taken from SuperLU -# -DSLUSRC = dlangs.o dgsequ.o dlaqgs.o dutil.o \ - dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o -ZSLUSRC = dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \ - zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o - -# -# Routines for double precision parallel SuperLU -DPLUSRC = dldperm.o ddistribute.o pdgstrf_irecv.o pdgstrs_Bglobal.o \ - pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\ - pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \ - pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \ - pdsymbfact_distdata.o - -# -# Routines for double complex parallel SuperLU -ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o \ - pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ - pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \ - pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \ - pzsymbfact_distdata.o - -#all: double complex16 -all: double - -double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) - $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ - $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) - $(RANLIB) $(DSUPERLULIB) - -complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) - $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ - $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) - $(RANLIB) $(DSUPERLULIB) - - -################################## -# Do not optimize these routines # -################################## -slamch.o: slamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $< -dlamch.o: dlamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $< -################################## - -.c.o: - $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) - -.f.o: - $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) - -clean: - rm -f *.o $(DSUPERLULIB) - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc21.f hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc21.f --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc21.f 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc21.f 1970-01-01 00:00:00.000000000 +0000 @@ -1,143 +0,0 @@ -*######DATE 8 Oct 1992 COPYRIGHT Rutherford Appleton Laboratory -C######8/10/92 Toolpack tool decs employed. -C######8/10/92 D version created by name change only. - SUBROUTINE MC21AD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,IW) -C .. Scalar Arguments .. - INTEGER LICN,N,NUMNZ -C .. -C .. Array Arguments .. - INTEGER ICN(LICN),IP(N),IPERM(N),IW(N,4),LENR(N) -C .. -C .. External Subroutines .. - EXTERNAL MC21BD -C .. -C .. Executable Statements .. - CALL MC21BD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,IW(1,1),IW(1,2), - + IW(1,3),IW(1,4)) - RETURN -C - END - SUBROUTINE MC21BD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,PR,ARP,CV,OUT) -C PR(I) IS THE PREVIOUS ROW TO I IN THE DEPTH FIRST SEARCH. -C IT IS USED AS A WORK ARRAY IN THE SORTING ALGORITHM. -C ELEMENTS (IPERM(I),I) I=1, ... N ARE NON-ZERO AT THE END OF THE -C ALGORITHM UNLESS N ASSIGNMENTS HAVE NOT BEEN MADE. IN WHICH CASE -C (IPERM(I),I) WILL BE ZERO FOR N-NUMNZ ENTRIES. -C CV(I) IS THE MOST RECENT ROW EXTENSION AT WHICH COLUMN I -C WAS VISITED. -C ARP(I) IS ONE LESS THAN THE NUMBER OF NON-ZEROS IN ROW I -C WHICH HAVE NOT BEEN SCANNED WHEN LOOKING FOR A CHEAP ASSIGNMENT. -C OUT(I) IS ONE LESS THAN THE NUMBER OF NON-ZEROS IN ROW I -C WHICH HAVE NOT BEEN SCANNED DURING ONE PASS THROUGH THE MAIN LOOP. -C -C INITIALIZATION OF ARRAYS. -C .. Scalar Arguments .. - INTEGER LICN,N,NUMNZ -C .. -C .. Array Arguments .. - INTEGER ARP(N),CV(N),ICN(LICN),IP(N),IPERM(N),LENR(N),OUT(N),PR(N) -C .. -C .. Local Scalars .. - INTEGER I,II,IN1,IN2,IOUTK,J,J1,JORD,K,KK -C .. -C .. Executable Statements .. - DO 10 I = 1,N - ARP(I) = LENR(I) - 1 - CV(I) = 0 - IPERM(I) = 0 - 10 CONTINUE - NUMNZ = 0 -C -C -C MAIN LOOP. -C EACH PASS ROUND THIS LOOP EITHER RESULTS IN A NEW ASSIGNMENT -C OR GIVES A ROW WITH NO ASSIGNMENT. - DO 100 JORD = 1,N - J = JORD - PR(J) = -1 - DO 70 K = 1,JORD -C LOOK FOR A CHEAP ASSIGNMENT - IN1 = ARP(J) - IF (IN1.LT.0) GO TO 30 - IN2 = IP(J) + LENR(J) - 1 - IN1 = IN2 - IN1 - DO 20 II = IN1,IN2 - I = ICN(II) - IF (IPERM(I).EQ.0) GO TO 80 - 20 CONTINUE -C NO CHEAP ASSIGNMENT IN ROW. - ARP(J) = -1 -C BEGIN LOOKING FOR ASSIGNMENT CHAIN STARTING WITH ROW J. - 30 CONTINUE - OUT(J) = LENR(J) - 1 -C INNER LOOP. EXTENDS CHAIN BY ONE OR BACKTRACKS. - DO 60 KK = 1,JORD - IN1 = OUT(J) - IF (IN1.LT.0) GO TO 50 - IN2 = IP(J) + LENR(J) - 1 - IN1 = IN2 - IN1 -C FORWARD SCAN. - DO 40 II = IN1,IN2 - I = ICN(II) - IF (CV(I).EQ.JORD) GO TO 40 -C COLUMN I HAS NOT YET BEEN ACCESSED DURING THIS PASS. - J1 = J - J = IPERM(I) - CV(I) = JORD - PR(J) = J1 - OUT(J1) = IN2 - II - 1 - GO TO 70 -C - 40 CONTINUE -C -C BACKTRACKING STEP. - 50 CONTINUE - J = PR(J) - IF (J.EQ.-1) GO TO 100 - 60 CONTINUE -C - 70 CONTINUE -C -C NEW ASSIGNMENT IS MADE. - 80 CONTINUE - IPERM(I) = J - ARP(J) = IN2 - II - 1 - NUMNZ = NUMNZ + 1 - DO 90 K = 1,JORD - J = PR(J) - IF (J.EQ.-1) GO TO 100 - II = IP(J) + LENR(J) - OUT(J) - 2 - I = ICN(II) - IPERM(I) = J - 90 CONTINUE -C - 100 CONTINUE -C -C IF MATRIX IS STRUCTURALLY SINGULAR, WE NOW COMPLETE THE -C PERMUTATION IPERM. - IF (NUMNZ.EQ.N) RETURN - DO 110 I = 1,N - ARP(I) = 0 - 110 CONTINUE - K = 0 - DO 130 I = 1,N - IF (IPERM(I).NE.0) GO TO 120 - K = K + 1 - OUT(K) = I - GO TO 130 -C - 120 CONTINUE - J = IPERM(I) - ARP(J) = I - 130 CONTINUE - K = 0 - DO 140 I = 1,N - IF (ARP(I).NE.0) GO TO 140 - K = K + 1 - IOUTK = OUT(K) - IPERM(IOUTK) = I - 140 CONTINUE - RETURN -C - END - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc64ad.f hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc64ad.f --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc64ad.f 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc64ad.f 1970-01-01 00:00:00.000000000 +0000 @@ -1,2002 +0,0 @@ -CCCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the -CCCCC Research Councils. All rights reserved. -CCCCC PACKAGE MC64A/AD -CCCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no) -CCCCC LAST UPDATE 20/09/99 -CCCCC -C *** Conditions on external use *** -C -C The user shall acknowledge the contribution of this -C package in any publication of material dependent upon the use of -C the package. The user shall use reasonable endeavours to notify -C the authors of the package of this publication. -C -C The user can modify this code but, at no time -C shall the right or title to all or any part of this package pass -C to the user. The user shall make available free of charge -C to the authors for any purpose all information relating to any -C alteration or addition made to this package for the purposes of -C extending the capabilities or enhancing the performance of this -C package. -C -C The user shall not pass this code directly to a third party without the -C express prior consent of the authors. Users wanting to licence their -C own copy of these routines should send email to hsl@aeat.co.uk -C -C None of the comments from the Copyright notice up to and including this -C one shall be removed or altered in any way. - -C********************************************************************** - SUBROUTINE MC64ID(ICNTL) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C -C Purpose -C ======= -C -C The components of the array ICNTL control the action of MC64A/AD. -C Default values for these are set in this subroutine. -C -C Parameters -C ========== -C - INTEGER ICNTL(10) -C -C Local variables - INTEGER I -C -C ICNTL(1) has default value 6. -C It is the output stream for error messages. If it -C is negative, these messages will be suppressed. -C -C ICNTL(2) has default value 6. -C It is the output stream for warning messages. -C If it is negative, these messages are suppressed. -C -C ICNTL(3) has default value -1. -C It is the output stream for monitoring printing. -C If it is negative, these messages are suppressed. -C -C ICNTL(4) has default value 0. -C If left at the defaut value, the incoming data is checked for -C out-of-range indices and duplicates. Setting ICNTL(4) to any -C other will avoid the checks but is likely to cause problems -C later if out-of-range indices or duplicates are present. -C The user should only set ICNTL(4) non-zero, if the data is -C known to avoid these problems. -C -C ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to -C zero in this routine. - -C Initialization of the ICNTL array. - ICNTL(1) = 6 - ICNTL(2) = 6 - ICNTL(3) = -1 - DO 10 I = 4,10 - ICNTL(I) = 0 - 10 CONTINUE - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64AD(JOB,N,NE,IP,IRN,A,NUM,CPERM,LIW,IW,LDW,DW, - & ICNTL,INFO) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C -C Purpose -C ======= -C -C This subroutine attempts to find a column permutation for an NxN -C sparse matrix A = {a_ij} that makes the permuted matrix have N -C entries on its diagonal. -C If the matrix is structurally nonsingular, the subroutine optionally -C returns a column permutation that maximizes the smallest element -C on the diagonal, maximizes the sum of the diagonal entries, or -C maximizes the product of the diagonal entries of the permuted matrix. -C For the latter option, the subroutine also finds scaling factors -C that may be used to scale the matrix so that the nonzero diagonal -C entries of the permuted matrix are one in absolute value and all the -C off-diagonal entries are less than or equal to one in absolute value. -C The natural logarithms of the scaling factors u(i), i=1..N, for the -C rows and v(j), j=1..N, for the columns are returned so that the -C scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). -C -C Parameters -C ========== -C - INTEGER JOB,N,NE,NUM,LIW,LDW - INTEGER IP(N+1),IRN(NE),CPERM(N),IW(LIW),ICNTL(10),INFO(10) - DOUBLE PRECISION A(NE),DW(LDW) -C -C JOB is an INTEGER variable which must be set by the user to -C control the action. It is not altered by the subroutine. -C Possible values for JOB are: -C 1 Compute a column permutation of the matrix so that the -C permuted matrix has as many entries on its diagonal as possible. -C The values on the diagonal are of arbitrary size. HSL subroutine -C MC21A/AD is used for this. See [1]. -C 2 Compute a column permutation of the matrix so that the smallest -C value on the diagonal of the permuted matrix is maximized. -C See [3]. -C 3 Compute a column permutation of the matrix so that the smallest -C value on the diagonal of the permuted matrix is maximized. -C The algorithm differs from the one used for JOB = 2 and may -C have quite a different performance. See [2]. -C 4 Compute a column permutation of the matrix so that the sum -C of the diagonal entries of the permuted matrix is maximized. -C See [3]. -C 5 Compute a column permutation of the matrix so that the product -C of the diagonal entries of the permuted matrix is maximized -C and vectors to scale the matrix so that the nonzero diagonal -C entries of the permuted matrix are one in absolute value and -C all the off-diagonal entries are less than or equal to one in -C absolute value. See [3]. -C Restriction: 1 <= JOB <= 5. -C -C N is an INTEGER variable which must be set by the user to the -C order of the matrix A. It is not altered by the subroutine. -C Restriction: N >= 1. -C -C NE is an INTEGER variable which must be set by the user to the -C number of entries in the matrix. It is not altered by the -C subroutine. -C Restriction: NE >= 1. -C -C IP is an INTEGER array of length N+1. -C IP(J), J=1..N, must be set by the user to the position in array IRN -C of the first row index of an entry in column J. IP(N+1) must be set -C to NE+1. It is not altered by the subroutine. -C -C IRN is an INTEGER array of length NE. -C IRN(K), K=1..NE, must be set by the user to hold the row indices of -C the entries of the matrix. Those belonging to column J must be -C stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering -C of the row indices within each column is unimportant. Repeated -C entries are not allowed. The array IRN is not altered by the -C subroutine. -C -C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. -C The user must set A(K), K=1..NE, to the numerical value of the -C entry that corresponds to IRN(K). -C It is not used by the subroutine when JOB = 1. -C It is not altered by the subroutine. -C -C NUM is an INTEGER variable that need not be set by the user. -C On successful exit, NUM will be the number of entries on the -C diagonal of the permuted matrix. -C If NUM < N, the matrix is structurally singular. -C -C CPERM is an INTEGER array of length N that need not be set by the -C user. On successful exit, CPERM contains the column permutation. -C Column CPERM(J) of the original matrix is column J in the permuted -C matrix, J=1..N. -C -C LIW is an INTEGER variable that must be set by the user to -C the dimension of array IW. It is not altered by the subroutine. -C Restriction: -C JOB = 1 : LIW >= 5N -C JOB = 2 : LIW >= 4N -C JOB = 3 : LIW >= 10N + NE -C JOB = 4 : LIW >= 5N -C JOB = 5 : LIW >= 5N -C -C IW is an INTEGER array of length LIW that is used for workspace. -C -C LDW is an INTEGER variable that must be set by the user to the -C dimension of array DW. It is not altered by the subroutine. -C Restriction: -C JOB = 1 : LDW is not used -C JOB = 2 : LDW >= N -C JOB = 3 : LDW >= NE -C JOB = 4 : LDW >= 2N + NE -C JOB = 5 : LDW >= 3N + NE -C -C DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW -C that is used for workspace. If JOB = 5, on return, -C DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. -C -C ICNTL is an INTEGER array of length 10. Its components control the -C output of MC64A/AD and must be set by the user before calling -C MC64A/AD. They are not altered by the subroutine. -C -C ICNTL(1) must be set to specify the output stream for -C error messages. If ICNTL(1) < 0, messages are suppressed. -C The default value set by MC46I/ID is 6. -C -C ICNTL(2) must be set by the user to specify the output stream for -C warning messages. If ICNTL(2) < 0, messages are suppressed. -C The default value set by MC46I/ID is 6. -C -C ICNTL(3) must be set by the user to specify the output stream for -C diagnostic messages. If ICNTL(3) < 0, messages are suppressed. -C The default value set by MC46I/ID is -1. -C -C ICNTL(4) must be set by the user to a value other than 0 to avoid -C checking of the input data. -C The default value set by MC46I/ID is 0. -C -C INFO is an INTEGER array of length 10 which need not be set by the -C user. INFO(1) is set non-negative to indicate success. A negative -C value is returned if an error occurred, a positive value if a -C warning occurred. INFO(2) holds further information on the error. -C On exit from the subroutine, INFO(1) will take one of the -C following values: -C 0 : successful entry (for structurally nonsingular matrix). -C +1 : successful entry (for structurally singular matrix). -C +2 : the returned scaling factors are large and may cause -C overflow when used to scale the matrix. -C (For JOB = 5 entry only.) -C -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). -C -2 : N < 1. Value of N held in INFO(2). -C -3 : NE < 1. Value of NE held in INFO(2). -C -4 : the defined length LIW violates the restriction on LIW. -C Value of LIW required given by INFO(2). -C -5 : the defined length LDW violates the restriction on LDW. -C Value of LDW required given by INFO(2). -C -6 : entries are found whose row indices are out of range. INFO(2) -C contains the index of a column in which such an entry is found. -C -7 : repeated entries are found. INFO(2) contains the index of a -C column in which such entries are found. -C INFO(3) to INFO(10) are not currently used and are set to zero by -C the routine. -C -C References: -C [1] I. S. Duff, (1981), -C "Algorithm 575. Permutations for a zero-free diagonal", -C ACM Trans. Math. Software 7(3), 387-390. -C [2] I. S. Duff and J. Koster, (1998), -C "The design and use of algorithms for permuting large -C entries to the diagonal of sparse matrices", -C SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. -C [3] I. S. Duff and J. Koster, (1999), -C "On algorithms for permuting large entries to the diagonal -C of sparse matrices", -C Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. - -C Local variables and parameters - INTEGER I,J,K - DOUBLE PRECISION FACT,ZERO,RINF - PARAMETER (ZERO=0.0D+00) -C External routines and functions -c EXTERNAL FD05AD -c DOUBLE PRECISION FD05AD - EXTERNAL MC21AD,MC64BD,MC64RD,MC64SD,MC64WD, HY_DLAMCH - DOUBLE PRECISION HY_DLAMCH -C Intrinsic functions - INTRINSIC ABS,LOG - -C Set RINF to largest positive real number (infinity) -c XSL RINF = FD05AD(5) - RINF = HY_DLAMCH('Overflow') - -C Check value of JOB - IF (JOB.LT.1 .OR. JOB.GT.5) THEN - INFO(1) = -1 - INFO(2) = JOB - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'JOB',JOB - GO TO 99 - ENDIF -C Check value of N - IF (N.LT.1) THEN - INFO(1) = -2 - INFO(2) = N - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'N',N - GO TO 99 - ENDIF -C Check value of NE - IF (NE.LT.1) THEN - INFO(1) = -3 - INFO(2) = NE - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'NE',NE - GO TO 99 - ENDIF -C Check LIW - IF (JOB.EQ.1) K = 5*N - IF (JOB.EQ.2) K = 4*N - IF (JOB.EQ.3) K = 10*N + NE - IF (JOB.EQ.4) K = 5*N - IF (JOB.EQ.5) K = 5*N - IF (LIW.LT.K) THEN - INFO(1) = -4 - INFO(2) = K - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9004) INFO(1),K - GO TO 99 - ENDIF -C Check LDW -C If JOB = 1, do not check - IF (JOB.GT.1) THEN - IF (JOB.EQ.2) K = N - IF (JOB.EQ.3) K = NE - IF (JOB.EQ.4) K = 2*N + NE - IF (JOB.EQ.5) K = 3*N + NE - IF (LDW.LT.K) THEN - INFO(1) = -5 - INFO(2) = K - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9005) INFO(1),K - GO TO 99 - ENDIF - ENDIF - IF (ICNTL(4).EQ.0) THEN -C Check row indices. Use IW(1:N) as workspace - DO 3 I = 1,N - IW(I) = 0 - 3 CONTINUE - DO 6 J = 1,N - DO 4 K = IP(J),IP(J+1)-1 - I = IRN(K) -C Check for row indices that are out of range - IF (I.LT.1 .OR. I.GT.N) THEN - INFO(1) = -6 - INFO(2) = J - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9006) INFO(1),J,I - GO TO 99 - ENDIF -C Check for repeated row indices within a column - IF (IW(I).EQ.J) THEN - INFO(1) = -7 - INFO(2) = J - IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9007) INFO(1),J,I - GO TO 99 - ELSE - IW(I) = J - ENDIF - 4 CONTINUE - 6 CONTINUE - ENDIF - -C Print diagnostics on input - IF (ICNTL(3).GE.0) THEN - WRITE(ICNTL(3),9020) JOB,N,NE - WRITE(ICNTL(3),9021) (IP(J),J=1,N+1) - WRITE(ICNTL(3),9022) (IRN(J),J=1,NE) - IF (JOB.GT.1) WRITE(ICNTL(3),9023) (A(J),J=1,NE) - ENDIF - -C Set components of INFO to zero - DO 8 I=1,10 - INFO(I) = 0 - 8 CONTINUE - -C Compute maximum matching with MC21A/AD - IF (JOB.EQ.1) THEN -C Put length of column J in IW(J) - DO 10 J = 1,N - IW(J) = IP(J+1) - IP(J) - 10 CONTINUE -C IW(N+1:5N) is workspace - CALL MC21AD(N,IRN,NE,IP,IW(1),CPERM,NUM,IW(N+1)) - GO TO 90 - ENDIF - -C Compute bottleneck matching - IF (JOB.EQ.2) THEN -C IW(1:5N), DW(1:N) are workspaces - CALL MC64BD(N,NE,IP,IRN,A,CPERM,NUM, - & IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),DW) - GO TO 90 - ENDIF - -C Compute bottleneck matching - IF (JOB.EQ.3) THEN -C Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE - DO 20 K = 1,NE - IW(K) = IRN(K) - DW(K) = ABS(A(K)) - 20 CONTINUE -C Sort entries in each column by decreasing value. - CALL MC64RD(N,NE,IP,IW,DW) -C IW(NE+1:NE+10N) is workspace - CALL MC64SD(N,NE,IP,IW(1),DW,CPERM,NUM,IW(NE+1), - & IW(NE+N+1),IW(NE+2*N+1),IW(NE+3*N+1),IW(NE+4*N+1), - & IW(NE+5*N+1),IW(NE+6*N+1)) - GO TO 90 - ENDIF - - IF (JOB.EQ.4) THEN - DO 50 J = 1,N - FACT = ZERO - DO 30 K = IP(J),IP(J+1)-1 - IF (ABS(A(K)).GT.FACT) FACT = ABS(A(K)) - 30 CONTINUE - DO 40 K = IP(J),IP(J+1)-1 - DW(2*N+K) = FACT - ABS(A(K)) - 40 CONTINUE - 50 CONTINUE -C B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces - CALL MC64WD(N,NE,IP,IRN,DW(2*N+1),CPERM,NUM, - & IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),IW(4*N+1), - & DW(1),DW(N+1)) - GO TO 90 - ENDIF - - IF (JOB.EQ.5) THEN - DO 75 J = 1,N - FACT = ZERO - DO 60 K = IP(J),IP(J+1)-1 - DW(3*N+K) = ABS(A(K)) - IF (DW(3*N+K).GT.FACT) FACT = DW(3*N+K) - 60 CONTINUE - DW(2*N+J) = FACT - IF (FACT.NE.ZERO) THEN - FACT = LOG(FACT) - ELSE - FACT = RINF/N - ENDIF - DO 70 K = IP(J),IP(J+1)-1 - IF (DW(3*N+K).NE.ZERO) THEN - DW(3*N+K) = FACT - LOG(DW(3*N+K)) - ELSE - DW(3*N+K) = RINF/N - ENDIF - 70 CONTINUE - 75 CONTINUE -C B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces - CALL MC64WD(N,NE,IP,IRN,DW(3*N+1),CPERM,NUM, - & IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),IW(4*N+1), - & DW(1),DW(N+1)) - IF (NUM.EQ.N) THEN - DO 80 J = 1,N - IF (DW(2*N+J).NE.ZERO) THEN - DW(N+J) = DW(N+J) - LOG(DW(2*N+J)) - ELSE - DW(N+J) = ZERO - ENDIF - 80 CONTINUE - ENDIF -C Check size of scaling factors - FACT = 0.5*LOG(RINF) - DO 86 J = 1,N - IF (DW(J).LT.FACT .AND. DW(N+J).LT.FACT) GO TO 86 - INFO(1) = 2 - GO TO 90 - 86 CONTINUE -C GO TO 90 - ENDIF - - 90 IF (INFO(1).EQ.0 .AND. NUM.LT.N) THEN -C Matrix is structurally singular, return with warning - INFO(1) = 1 - IF (ICNTL(2).GE.0) WRITE(ICNTL(2),9011) INFO(1) - ENDIF - IF (INFO(1).EQ.2) THEN -C Scaling factors are large, return with warning - IF (ICNTL(2).GE.0) WRITE(ICNTL(2),9012) INFO(1) - ENDIF - -C Print diagnostics on output - IF (ICNTL(3).GE.0) THEN - WRITE(ICNTL(3),9030) (INFO(J),J=1,2) - WRITE(ICNTL(3),9031) NUM - WRITE(ICNTL(3),9032) (CPERM(J),J=1,N) - IF (JOB.EQ.5) THEN - WRITE(ICNTL(3),9033) (DW(J),J=1,N) - WRITE(ICNTL(3),9034) (DW(N+J),J=1,N) - ENDIF - ENDIF - -C Return from subroutine. - 99 RETURN - - 9001 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2, - & ' because ',(A),' = ',I10) - 9004 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/ - & ' LIW too small, must be at least ',I8) - 9005 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/ - & ' LDW too small, must be at least ',I8) - 9006 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/ - & ' Column ',I8, - & ' contains an entry with invalid row index ',I8) - 9007 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/ - & ' Column ',I8, - & ' contains two or more entries with row index ',I8) - 9011 FORMAT (' ****** Warning from MC64A/AD. INFO(1) = ',I2/ - & ' The matrix is structurally singular.') - 9012 FORMAT (' ****** Warning from MC64A/AD. INFO(1) = ',I2/ - & ' Some scaling factors may be too large.') - 9020 FORMAT (' ****** Input parameters for MC64A/AD:'/ - & ' JOB = ',I8/' N = ',I8/' NE = ',I8) - 9021 FORMAT (' IP(1:N+1) = ',8I8/(14X,8I8)) - 9022 FORMAT (' IRN(1:NE) = ',8I8/(14X,8I8)) - 9023 FORMAT (' A(1:NE) = ',4(1PD14.4)/(14X,4(1PD14.4))) - 9030 FORMAT (' ****** Output parameters for MC64A/AD:'/ - & ' INFO(1:2) = ',2I8) - 9031 FORMAT (' NUM = ',I8) - 9032 FORMAT (' CPERM(1:N) = ',8I8/(14X,8I8)) - 9033 FORMAT (' DW(1:N) = ',5(F11.3)/(14X,5(F11.3))) - 9034 FORMAT (' DW(N+1:2N) = ',5(F11.3)/(14X,5(F11.3))) - END - -C********************************************************************** - SUBROUTINE MC64BD(N,NE,IP,IRN,A,IPERM,NUM,JPERM,PR,Q,L,D) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER N,NE,NUM - INTEGER IP(N+1),IRN(NE),IPERM(N),JPERM(N),PR(N),Q(N),L(N) - DOUBLE PRECISION A(NE),D(N) - -C N, NE, IP, IRN are described in MC64A/AD. -C A is a REAL (DOUBLE PRECISION in the D-version) array of length -C NE. A(K), K=1..NE, must be set to the value of the entry -C that corresponds to IRN(K). It is not altered. -C IPERM is an INTEGER array of length N. On exit, it contains the -C matching: IPERM(I) = 0 or row I is matched to column IPERM(I). -C NUM is INTEGER variable. On exit, it contains the cardinality of the -C matching stored in IPERM. -C IW is an INTEGER work array of length 4N. -C DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. - -C Local variables - INTEGER I,II,J,JJ,JORD,Q0,QLEN,IDUM,JDUM,ISP,JSP, - & K,KK,KK1,KK2,I0,UP,LOW - DOUBLE PRECISION CSP,DI,DNEW,DQ0,AI,A0,BV -C Local parameters - DOUBLE PRECISION RINF,ZERO,MINONE - PARAMETER (ZERO=0.0D+0,MINONE=-1.0D+0) -C Intrinsic functions - INTRINSIC ABS,MIN -C External subroutines and/or functions -c EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH -c DOUBLE PRECISION FD05AD, DLAMCH - EXTERNAL MC64DD,MC64ED,MC64FD, HY_DLAMCH - DOUBLE PRECISION HY_DLAMCH - -C Set RINF to largest positive real number -c XSL RINF = FD05AD(5) - RINF = HY_DLAMCH('Overflow') - -C Initialization - NUM = 0 - BV = RINF - DO 10 K = 1,N - IPERM(K) = 0 - JPERM(K) = 0 - PR(K) = IP(K) - D(K) = ZERO - 10 CONTINUE -C Scan columns of matrix; - DO 20 J = 1,N - A0 = MINONE - DO 30 K = IP(J),IP(J+1)-1 - I = IRN(K) - AI = ABS(A(K)) - IF (AI.GT.D(I)) D(I) = AI - IF (JPERM(J).NE.0) GO TO 30 - IF (AI.GE.BV) THEN - A0 = BV - IF (IPERM(I).NE.0) GO TO 30 - JPERM(J) = I - IPERM(I) = J - NUM = NUM + 1 - ELSE - IF (AI.LE.A0) GO TO 30 - A0 = AI - I0 = I - ENDIF - 30 CONTINUE - IF (A0.NE.MINONE .AND. A0.LT.BV) THEN - BV = A0 - IF (IPERM(I0).NE.0) GO TO 20 - IPERM(I0) = J - JPERM(J) = I0 - NUM = NUM + 1 - ENDIF - 20 CONTINUE -C Update BV with smallest of all the largest maximum absolute values -C of the rows. - DO 25 I = 1,N - BV = MIN(BV,D(I)) - 25 CONTINUE - IF (NUM.EQ.N) GO TO 1000 -C Rescan unassigned columns; improve initial assignment - DO 95 J = 1,N - IF (JPERM(J).NE.0) GO TO 95 - DO 50 K = IP(J),IP(J+1)-1 - I = IRN(K) - AI = ABS(A(K)) - IF (AI.LT.BV) GO TO 50 - IF (IPERM(I).EQ.0) GO TO 90 - JJ = IPERM(I) - KK1 = PR(JJ) - KK2 = IP(JJ+1) - 1 - IF (KK1.GT.KK2) GO TO 50 - DO 70 KK = KK1,KK2 - II = IRN(KK) - IF (IPERM(II).NE.0) GO TO 70 - IF (ABS(A(KK)).GE.BV) GO TO 80 - 70 CONTINUE - PR(JJ) = KK2 + 1 - 50 CONTINUE - GO TO 95 - 80 JPERM(JJ) = II - IPERM(II) = JJ - PR(JJ) = KK + 1 - 90 NUM = NUM + 1 - JPERM(J) = I - IPERM(I) = J - PR(J) = K + 1 - 95 CONTINUE - IF (NUM.EQ.N) GO TO 1000 - -C Prepare for main loop - DO 99 I = 1,N - D(I) = MINONE - L(I) = 0 - 99 CONTINUE - -C Main loop ... each pass round this loop is similar to Dijkstra's -C algorithm for solving the single source shortest path problem - - DO 100 JORD = 1,N - - IF (JPERM(JORD).NE.0) GO TO 100 - QLEN = 0 - LOW = N + 1 - UP = N + 1 -C CSP is cost of shortest path to any unassigned row -C ISP is matrix position of unassigned row element in shortest path -C JSP is column index of unassigned row element in shortest path - CSP = MINONE -C Build shortest path tree starting from unassigned column JORD - J = JORD - PR(J) = -1 - -C Scan column J - DO 115 K = IP(J),IP(J+1)-1 - I = IRN(K) - DNEW = ABS(A(K)) - IF (CSP.GE.DNEW) GO TO 115 - IF (IPERM(I).EQ.0) THEN -C Row I is unassigned; update shortest path info - CSP = DNEW - ISP = I - JSP = J - IF (CSP.GE.BV) GO TO 160 - ELSE - D(I) = DNEW - IF (DNEW.GE.BV) THEN -C Add row I to Q2 - LOW = LOW - 1 - Q(LOW) = I - ELSE -C Add row I to Q, and push it - QLEN = QLEN + 1 - L(I) = QLEN - CALL MC64DD(I,N,Q,D,L,1) - ENDIF - JJ = IPERM(I) - PR(JJ) = J - ENDIF - 115 CONTINUE - - DO 150 JDUM = 1,NUM -C If Q2 is empty, extract new rows from Q - IF (LOW.EQ.UP) THEN - IF (QLEN.EQ.0) GO TO 160 - I = Q(1) - IF (CSP.GE.D(I)) GO TO 160 - BV = D(I) - DO 152 IDUM = 1,N - CALL MC64ED(QLEN,N,Q,D,L,1) - L(I) = 0 - LOW = LOW - 1 - Q(LOW) = I - IF (QLEN.EQ.0) GO TO 153 - I = Q(1) - IF (D(I).NE.BV) GO TO 153 - 152 CONTINUE -C End of dummy loop; this point is never reached - ENDIF -C Move row Q0 - 153 UP = UP - 1 - Q0 = Q(UP) - DQ0 = D(Q0) - L(Q0) = UP -C Scan column that matches with row Q0 - J = IPERM(Q0) - DO 155 K = IP(J),IP(J+1)-1 - I = IRN(K) -C Update D(I) - IF (L(I).GE.UP) GO TO 155 - DNEW = MIN(DQ0,ABS(A(K))) - IF (CSP.GE.DNEW) GO TO 155 - IF (IPERM(I).EQ.0) THEN -C Row I is unassigned; update shortest path info - CSP = DNEW - ISP = I - JSP = J - IF (CSP.GE.BV) GO TO 160 - ELSE - DI = D(I) - IF (DI.GE.BV .OR. DI.GE.DNEW) GO TO 155 - D(I) = DNEW - IF (DNEW.GE.BV) THEN -C Delete row I from Q (if necessary); add row I to Q2 - IF (DI.NE.MINONE) - * CALL MC64FD(L(I),QLEN,N,Q,D,L,1) - L(I) = 0 - LOW = LOW - 1 - Q(LOW) = I - ELSE -C Add row I to Q (if necessary); push row I up Q - IF (DI.EQ.MINONE) THEN - QLEN = QLEN + 1 - L(I) = QLEN - ENDIF - CALL MC64DD(I,N,Q,D,L,1) - ENDIF -C Update tree - JJ = IPERM(I) - PR(JJ) = J - ENDIF - 155 CONTINUE - 150 CONTINUE - -C If CSP = MINONE, no augmenting path is found - 160 IF (CSP.EQ.MINONE) GO TO 190 -C Update bottleneck value - BV = MIN(BV,CSP) -C Find augmenting path by tracing backward in PR; update IPERM,JPERM - NUM = NUM + 1 - I = ISP - J = JSP - DO 170 JDUM = 1,NUM+1 - I0 = JPERM(J) - JPERM(J) = I - IPERM(I) = J - J = PR(J) - IF (J.EQ.-1) GO TO 190 - I = I0 - 170 CONTINUE -C End of dummy loop; this point is never reached - 190 DO 191 KK = UP,N - I = Q(KK) - D(I) = MINONE - L(I) = 0 - 191 CONTINUE - DO 192 KK = LOW,UP-1 - I = Q(KK) - D(I) = MINONE - 192 CONTINUE - DO 193 KK = 1,QLEN - I = Q(KK) - D(I) = MINONE - L(I) = 0 - 193 CONTINUE - - 100 CONTINUE -C End of main loop - -C BV is bottleneck value of final matching - IF (NUM.EQ.N) GO TO 1000 - -C Matrix is structurally singular, complete IPERM. -C JPERM, PR are work arrays - DO 300 J = 1,N - JPERM(J) = 0 - 300 CONTINUE - K = 0 - DO 310 I = 1,N - IF (IPERM(I).EQ.0) THEN - K = K + 1 - PR(K) = I - ELSE - J = IPERM(I) - JPERM(J) = I - ENDIF - 310 CONTINUE - K = 0 - DO 320 I = 1,N - IF (JPERM(I).NE.0) GO TO 320 - K = K + 1 - JDUM = PR(K) - IPERM(JDUM) = I - 320 CONTINUE - - 1000 RETURN - END - -C********************************************************************** - SUBROUTINE MC64DD(I,N,Q,D,L,IWAY) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER I,N,IWAY - INTEGER Q(N),L(N) - DOUBLE PRECISION D(N) - -C Variables N,Q,D,L are described in MC64B/BD -C IF IWAY is equal to 1, then -C node I is pushed from its current position upwards -C IF IWAY is not equal to 1, then -C node I is pushed from its current position downwards - -C Local variables and parameters - INTEGER IDUM,K,POS,POSK,QK - PARAMETER (K=2) - DOUBLE PRECISION DI - - DI = D(I) - POS = L(I) -C POS is index of current position of I in the tree - IF (IWAY.EQ.1) THEN - DO 10 IDUM = 1,N - IF (POS.LE.1) GO TO 20 - POSK = POS/K - QK = Q(POSK) - IF (DI.LE.D(QK)) GO TO 20 - Q(POS) = QK - L(QK) = POS - POS = POSK - 10 CONTINUE -C End of dummy loop; this point is never reached - ELSE - DO 15 IDUM = 1,N - IF (POS.LE.1) GO TO 20 - POSK = POS/K - QK = Q(POSK) - IF (DI.GE.D(QK)) GO TO 20 - Q(POS) = QK - L(QK) = POS - POS = POSK - 15 CONTINUE -C End of dummy loop; this point is never reached - ENDIF -C End of dummy if; this point is never reached - 20 Q(POS) = I - L(I) = POS - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64ED(QLEN,N,Q,D,L,IWAY) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER QLEN,N,IWAY - INTEGER Q(N),L(N) - DOUBLE PRECISION D(N) - -C Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or -C MC64W/WD (IWAY = 2) -C The root node is deleted from the binary heap. - -C Local variables and parameters - INTEGER I,IDUM,K,POS,POSK - PARAMETER (K=2) - DOUBLE PRECISION DK,DR,DI - -C Move last element to begin of Q - I = Q(QLEN) - DI = D(I) - QLEN = QLEN - 1 - POS = 1 - IF (IWAY.EQ.1) THEN - DO 10 IDUM = 1,N - POSK = K*POS - IF (POSK.GT.QLEN) GO TO 20 - DK = D(Q(POSK)) - IF (POSK.LT.QLEN) THEN - DR = D(Q(POSK+1)) - IF (DK.LT.DR) THEN - POSK = POSK + 1 - DK = DR - ENDIF - ENDIF - IF (DI.GE.DK) GO TO 20 -C Exchange old last element with larger priority child - Q(POS) = Q(POSK) - L(Q(POS)) = POS - POS = POSK - 10 CONTINUE -C End of dummy loop; this point is never reached - ELSE - DO 15 IDUM = 1,N - POSK = K*POS - IF (POSK.GT.QLEN) GO TO 20 - DK = D(Q(POSK)) - IF (POSK.LT.QLEN) THEN - DR = D(Q(POSK+1)) - IF (DK.GT.DR) THEN - POSK = POSK + 1 - DK = DR - ENDIF - ENDIF - IF (DI.LE.DK) GO TO 20 -C Exchange old last element with smaller child - Q(POS) = Q(POSK) - L(Q(POS)) = POS - POS = POSK - 15 CONTINUE -C End of dummy loop; this point is never reached - ENDIF -C End of dummy if; this point is never reached - 20 Q(POS) = I - L(I) = POS - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64FD(POS0,QLEN,N,Q,D,L,IWAY) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER POS0,QLEN,N,IWAY - INTEGER Q(N),L(N) - DOUBLE PRECISION D(N) - -C Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or -C MC64WD (IWAY = 2). -C Move last element in the heap - - INTEGER I,IDUM,K,POS,POSK,QK - PARAMETER (K=2) - DOUBLE PRECISION DK,DR,DI - -C Quick return, if possible - IF (QLEN.EQ.POS0) THEN - QLEN = QLEN - 1 - RETURN - ENDIF - -C Move last element from queue Q to position POS0 -C POS is current position of node I in the tree - I = Q(QLEN) - DI = D(I) - QLEN = QLEN - 1 - POS = POS0 - IF (IWAY.EQ.1) THEN - DO 10 IDUM = 1,N - IF (POS.LE.1) GO TO 20 - POSK = POS/K - QK = Q(POSK) - IF (DI.LE.D(QK)) GO TO 20 - Q(POS) = QK - L(QK) = POS - POS = POSK - 10 CONTINUE -C End of dummy loop; this point is never reached - 20 Q(POS) = I - L(I) = POS - DO 30 IDUM = 1,N - POSK = K*POS - IF (POSK.GT.QLEN) GO TO 40 - DK = D(Q(POSK)) - IF (POSK.LT.QLEN) THEN - DR = D(Q(POSK+1)) - IF (DK.LT.DR) THEN - POSK = POSK + 1 - DK = DR - ENDIF - ENDIF - IF (DI.GE.DK) GO TO 40 - QK = Q(POSK) - Q(POS) = QK - L(QK) = POS - POS = POSK - 30 CONTINUE -C End of dummy loop; this point is never reached - ELSE - DO 32 IDUM = 1,N - IF (POS.LE.1) GO TO 34 - POSK = POS/K - QK = Q(POSK) - IF (DI.GE.D(QK)) GO TO 34 - Q(POS) = QK - L(QK) = POS - POS = POSK - 32 CONTINUE -C End of dummy loop; this point is never reached - 34 Q(POS) = I - L(I) = POS - DO 36 IDUM = 1,N - POSK = K*POS - IF (POSK.GT.QLEN) GO TO 40 - DK = D(Q(POSK)) - IF (POSK.LT.QLEN) THEN - DR = D(Q(POSK+1)) - IF (DK.GT.DR) THEN - POSK = POSK + 1 - DK = DR - ENDIF - ENDIF - IF (DI.LE.DK) GO TO 40 - QK = Q(POSK) - Q(POS) = QK - L(QK) = POS - POS = POSK - 36 CONTINUE -C End of dummy loop; this point is never reached - ENDIF -C End of dummy if; this point is never reached - 40 Q(POS) = I - L(I) = POS - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64RD(N,NE,IP,IRN,A) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER N,NE - INTEGER IP(N+1),IRN(NE) - DOUBLE PRECISION A(NE) - -C This subroutine sorts the entries in each column of the -C sparse matrix (defined by N,NE,IP,IRN,A) by decreasing -C numerical value. - -C Local constants - INTEGER THRESH,TDLEN - PARAMETER (THRESH=15,TDLEN=50) -C Local variables - INTEGER J,IPJ,K,LEN,R,S,HI,FIRST,MID,LAST,TD - DOUBLE PRECISION HA,KEY -C Local arrays - INTEGER TODO(TDLEN) - - DO 100 J = 1,N - LEN = IP(J+1) - IP(J) - IF (LEN.LE.1) GO TO 100 - IPJ = IP(J) - -C Sort array roughly with partial quicksort - IF (LEN.LT.THRESH) GO TO 400 - TODO(1) = IPJ - TODO(2) = IPJ + LEN - TD = 2 - 500 CONTINUE - FIRST = TODO(TD-1) - LAST = TODO(TD) -C KEY is the smallest of two values present in interval [FIRST,LAST) - KEY = A((FIRST+LAST)/2) - DO 475 K = FIRST,LAST-1 - HA = A(K) - IF (HA.EQ.KEY) GO TO 475 - IF (HA.GT.KEY) GO TO 470 - KEY = HA - GO TO 470 - 475 CONTINUE -C Only one value found in interval, so it is already sorted - TD = TD - 2 - GO TO 425 - -C Reorder interval [FIRST,LAST) such that entries before MID are gt KEY - 470 MID = FIRST - DO 450 K = FIRST,LAST-1 - IF (A(K).LE.KEY) GO TO 450 - HA = A(MID) - A(MID) = A(K) - A(K) = HA - HI = IRN(MID) - IRN(MID) = IRN(K) - IRN(K) = HI - MID = MID + 1 - 450 CONTINUE -C Both subintervals [FIRST,MID), [MID,LAST) are nonempty -C Stack the longest of the two subintervals first - IF (MID-FIRST.GE.LAST-MID) THEN - TODO(TD+2) = LAST - TODO(TD+1) = MID - TODO(TD) = MID -C TODO(TD-1) = FIRST - ELSE - TODO(TD+2) = MID - TODO(TD+1) = FIRST - TODO(TD) = LAST - TODO(TD-1) = MID - ENDIF - TD = TD + 2 - - 425 CONTINUE - IF (TD.EQ.0) GO TO 400 -C There is still work to be done - IF (TODO(TD)-TODO(TD-1).GE.THRESH) GO TO 500 -C Next interval is already short enough for straightforward insertion - TD = TD - 2 - GO TO 425 - -C Complete sorting with straightforward insertion - 400 DO 200 R = IPJ+1,IPJ+LEN-1 - IF (A(R-1) .LT. A(R)) THEN - HA = A(R) - HI = IRN(R) - A(R) = A(R-1) - IRN(R) = IRN(R-1) - DO 300 S = R-1,IPJ+1,-1 - IF (A(S-1) .LT. HA) THEN - A(S) = A(S-1) - IRN(S) = IRN(S-1) - ELSE - A(S) = HA - IRN(S) = HI - GO TO 200 - END IF - 300 CONTINUE - A(IPJ) = HA - IRN(IPJ) = HI - END IF - 200 CONTINUE - - 100 CONTINUE - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64SD(N,NE,IP,IRN,A,IPERM,NUMX, - & W,LEN,LENL,LENH,FC,IW,IW4) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER N,NE,NUMX - INTEGER IP(N+1),IRN(NE),IPERM(N), - & W(N),LEN(N),LENL(N),LENH(N),FC(N),IW(N),IW4(4*N) - DOUBLE PRECISION A(NE) - -C N, NE, IP, IRN, are described in MC64A/AD. -C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. -C A(K), K=1..NE, must be set to the value of the entry that -C corresponds to IRN(k). The entries in each column must be -C non-negative and ordered by decreasing value. -C IPERM is an INTEGER array of length N. On exit, it contains the -C bottleneck matching: IPERM(I) - 0 or row I is matched to column -C IPERM(I). -C NUMX is an INTEGER variable. On exit, it contains the cardinality -C of the matching stored in IPERM. -C IW is an INTEGER work array of length 10N. - -C FC is an integer array of length N that contains the list of -C unmatched columns. -C LEN(J), LENL(J), LENH(J) are integer arrays of length N that point -C to entries in matrix column J. -C In the matrix defined by the column parts IP(J)+LENL(J) we know -C a matching does not exist; in the matrix defined by the column -C parts IP(J)+LENH(J) we know one exists. -C LEN(J) lies between LENL(J) and LENH(J) and determines the matrix -C that is tested for a maximum matching. -C W is an integer array of length N and contains the indices of the -C columns for which LENL ne LENH. -C WLEN is number of indices stored in array W. -C IW is integer work array of length N. -C IW4 is integer work array of length 4N used by MC64U/UD. - - INTEGER NUM,NVAL,WLEN,II,I,J,K,L,CNT,MOD,IDUM1,IDUM2,IDUM3 - DOUBLE PRECISION BVAL,BMIN,BMAX,RINF -c EXTERNAL FD05AD,MC64QD,MC64UD -c DOUBLE PRECISION FD05AD - EXTERNAL MC64QD,MC64UD, HY_DLAMCH - DOUBLE PRECISION HY_DLAMCH - -C BMIN and BMAX are such that a maximum matching exists for the input -C matrix in which all entries smaller than BMIN are dropped. -C For BMAX, a maximum matching does not exist. -C BVAL is a value between BMIN and BMAX. -C CNT is the number of calls made to MC64U/UD so far. -C NUM is the cardinality of last matching found. - -C Set RINF to largest positive real number -c XSL RINF = FD05AD(5) - RINF = HY_DLAMCH('Overflow') - -C Compute a first maximum matching from scratch on whole matrix. - DO 20 J = 1,N - FC(J) = J - IW(J) = 0 - LEN(J) = IP(J+1) - IP(J) - 20 CONTINUE -C The first call to MC64U/UD - CNT = 1 - MOD = 1 - NUMX = 0 - CALL MC64UD(CNT,MOD,N,IRN,NE,IP,LEN,FC,IW,NUMX,N, - & IW4(1),IW4(N+1),IW4(2*N+1),IW4(3*N+1)) - -C IW contains a maximum matching of length NUMX. - NUM = NUMX - - IF (NUM.NE.N) THEN -C Matrix is structurally singular - BMAX = RINF - ELSE -C Matrix is structurally nonsingular, NUM=NUMX=N; -C Set BMAX just above the smallest of all the maximum absolute -C values of the columns - BMAX = RINF - DO 30 J = 1,N - BVAL = 0.0 - DO 25 K = IP(J),IP(J+1)-1 - IF (A(K).GT.BVAL) BVAL = A(K) - 25 CONTINUE - IF (BVAL.LT.BMAX) BMAX = BVAL - 30 CONTINUE - BMAX = 1.001 * BMAX - ENDIF - -C Initialize BVAL,BMIN - BVAL = 0.0 - BMIN = 0.0 -C Initialize LENL,LEN,LENH,W,WLEN according to BMAX. -C Set LEN(J), LENH(J) just after last entry in column J. -C Set LENL(J) just after last entry in column J with value ge BMAX. - WLEN = 0 - DO 48 J = 1,N - L = IP(J+1) - IP(J) - LENH(J) = L - LEN(J) = L - DO 45 K = IP(J),IP(J+1)-1 - IF (A(K).LT.BMAX) GO TO 46 - 45 CONTINUE -C Column J is empty or all entries are ge BMAX - K = IP(J+1) - 46 LENL(J) = K - IP(J) -C Add J to W if LENL(J) ne LENH(J) - IF (LENL(J).EQ.L) GO TO 48 - WLEN = WLEN + 1 - W(WLEN) = J - 48 CONTINUE - -C Main loop - DO 90 IDUM1 = 1,NE - IF (NUM.EQ.NUMX) THEN -C We have a maximum matching in IW; store IW in IPERM - DO 50 I = 1,N - IPERM(I) = IW(I) - 50 CONTINUE -C Keep going round this loop until matching IW is no longer maximum. - DO 80 IDUM2 = 1,NE - BMIN = BVAL - IF (BMAX .EQ. BMIN) GO TO 99 -C Find splitting value BVAL - CALL MC64QD(IP,LENL,LEN,W,WLEN,A,NVAL,BVAL) - IF (NVAL.LE.1) GO TO 99 -C Set LEN such that all matrix entries with value lt BVAL are -C discarded. Store old LEN in LENH. Do this for all columns W(K). -C Each step, either K is incremented or WLEN is decremented. - K = 1 - DO 70 IDUM3 = 1,N - IF (K.GT.WLEN) GO TO 71 - J = W(K) - DO 55 II = IP(J)+LEN(J)-1,IP(J)+LENL(J),-1 - IF (A(II).GE.BVAL) GO TO 60 - I = IRN(II) - IF (IW(I).NE.J) GO TO 55 -C Remove entry from matching - IW(I) = 0 - NUM = NUM - 1 - FC(N-NUM) = J - 55 CONTINUE - 60 LENH(J) = LEN(J) -C IP(J)+LEN(J)-1 is last entry in column ge BVAL - LEN(J) = II - IP(J) + 1 -C If LENH(J) = LENL(J), remove J from W - IF (LENL(J).EQ.LENH(J)) THEN - W(K) = W(WLEN) - WLEN = WLEN - 1 - ELSE - K = K + 1 - ENDIF - 70 CONTINUE - 71 IF (NUM.LT.NUMX) GO TO 81 - 80 CONTINUE -C End of dummy loop; this point is never reached -C Set mode for next call to MC64U/UD - 81 MOD = 1 - ELSE -C We do not have a maximum matching in IW. - BMAX = BVAL -C BMIN is the bottleneck value of a maximum matching; -C for BMAX the matching is not maximum, so BMAX>BMIN -C IF (BMAX .EQ. BMIN) GO TO 99 -C Find splitting value BVAL - CALL MC64QD(IP,LEN,LENH,W,WLEN,A,NVAL,BVAL) - IF (NVAL.EQ.0. OR. BVAL.EQ.BMIN) GO TO 99 -C Set LEN such that all matrix entries with value ge BVAL are -C inside matrix. Store old LEN in LENL. Do this for all columns W(K). -C Each step, either K is incremented or WLEN is decremented. - K = 1 - DO 87 IDUM3 = 1,N - IF (K.GT.WLEN) GO TO 88 - J = W(K) - DO 85 II = IP(J)+LEN(J),IP(J)+LENH(J)-1 - IF (A(II).LT.BVAL) GO TO 86 - 85 CONTINUE - 86 LENL(J) = LEN(J) - LEN(J) = II - IP(J) - IF (LENL(J).EQ.LENH(J)) THEN - W(K) = W(WLEN) - WLEN = WLEN - 1 - ELSE - K = K + 1 - ENDIF - 87 CONTINUE -C End of dummy loop; this point is never reached -C Set mode for next call to MC64U/UD - 88 MOD = 0 - ENDIF - CNT = CNT + 1 - CALL MC64UD(CNT,MOD,N,IRN,NE,IP,LEN,FC,IW,NUM,NUMX, - & IW4(1),IW4(N+1),IW4(2*N+1),IW4(3*N+1)) - -C IW contains maximum matching of length NUM - 90 CONTINUE -C End of dummy loop; this point is never reached - -C BMIN is bottleneck value of final matching - 99 IF (NUMX.EQ.N) GO TO 1000 -C The matrix is structurally singular, complete IPERM -C W, IW are work arrays - DO 300 J = 1,N - W(J) = 0 - 300 CONTINUE - K = 0 - DO 310 I = 1,N - IF (IPERM(I).EQ.0) THEN - K = K + 1 - IW(K) = I - ELSE - J = IPERM(I) - W(J) = I - ENDIF - 310 CONTINUE - K = 0 - DO 320 J = 1,N - IF (W(J).NE.0) GO TO 320 - K = K + 1 - IDUM1 = IW(K) - IPERM(IDUM1) = J - 320 CONTINUE - - 1000 RETURN - END - -C********************************************************************** - SUBROUTINE MC64QD(IP,LENL,LENH,W,WLEN,A,NVAL,VAL) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER WLEN,NVAL - INTEGER IP(*),LENL(*),LENH(*),W(*) - DOUBLE PRECISION A(*),VAL - -C This routine searches for at most XX different numerical values -C in the columns W(1:WLEN). XX>=2. -C Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 -C until XX values are found or all columns have been considered. -C On output, NVAL is the number of different values that is found -C and SPLIT(1:NVAL) contains the values in decreasing order. -C If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). -C - INTEGER XX,J,K,II,S,POS - PARAMETER (XX=10) - DOUBLE PRECISION SPLIT(XX),HA - -C Scan columns in W(1:WLEN). For each encountered value, if value not -C already present in SPLIT(1:NVAL), insert value such that SPLIT -C remains sorted by decreasing value. -C The sorting is done by straightforward insertion; therefore the use -C of this routine should be avoided for large XX (XX < 20). - NVAL = 0 - DO 10 K = 1,WLEN - J = W(K) - DO 15 II = IP(J)+LENL(J),IP(J)+LENH(J)-1 - HA = A(II) - IF (NVAL.EQ.0) THEN - SPLIT(1) = HA - NVAL = 1 - ELSE -C Check presence of HA in SPLIT - DO 20 S = NVAL,1,-1 - IF (SPLIT(S).EQ.HA) GO TO 15 - IF (SPLIT(S).GT.HA) THEN - POS = S + 1 - GO TO 21 - ENDIF - 20 CONTINUE - POS = 1 -C The insertion - 21 DO 22 S = NVAL,POS,-1 - SPLIT(S+1) = SPLIT(S) - 22 CONTINUE - SPLIT(POS) = HA - NVAL = NVAL + 1 - ENDIF -C Exit loop if XX values are found - IF (NVAL.EQ.XX) GO TO 11 - 15 CONTINUE - 10 CONTINUE -C Determine VAL - 11 IF (NVAL.GT.0) VAL = SPLIT((NVAL+1)/2) - - RETURN - END - -C********************************************************************** - SUBROUTINE MC64UD(ID,MOD,N,IRN,LIRN,IP,LENC,FC,IPERM,NUM,NUMX, - & PR,ARP,CV,OUT) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER ID,MOD,N,LIRN,NUM,NUMX - INTEGER ARP(N),CV(N),IRN(LIRN),IP(N), - & FC(N),IPERM(N),LENC(N),OUT(N),PR(N) - -C PR(J) is the previous column to J in the depth first search. -C Array PR is used as workspace in the sorting algorithm. -C Elements (I,IPERM(I)) I=1,..,N are entries at the end of the -C algorithm unless N assignments have not been made in which case -C N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. -C CV(I) is the most recent loop number (ID+JORD) at which row I -C was visited. -C ARP(J) is the number of entries in column J which have been scanned -C when looking for a cheap assignment. -C OUT(J) is one less than the number of entries in column J which have -C not been scanned during one pass through the main loop. -C NUMX is maximum possible size of matching. - - INTEGER I,II,IN1,IN2,J,J1,JORD,K,KK,LAST,NFC, - & NUM0,NUM1,NUM2,ID0,ID1 - - IF (ID.EQ.1) THEN -C The first call to MC64U/UD. -C Initialize CV and ARP; parameters MOD, NUMX are not accessed - DO 5 I = 1,N - CV(I) = 0 - ARP(I) = 0 - 5 CONTINUE - NUM1 = N - NUM2 = N - ELSE -C Not the first call to MC64U/UD. -C Re-initialize ARP if entries were deleted since last call to MC64U/UD - IF (MOD.EQ.1) THEN - DO 8 I = 1,N - ARP(I) = 0 - 8 CONTINUE - ENDIF - NUM1 = NUMX - NUM2 = N - NUMX - ENDIF - NUM0 = NUM - -C NUM0 is size of input matching -C NUM1 is maximum possible size of matching -C NUM2 is maximum allowed number of unassigned rows/columns -C NUM is size of current matching - -C Quick return if possible -C IF (NUM.EQ.N) GO TO 199 -C NFC is number of rows/columns that could not be assigned - NFC = 0 -C Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, -C so 1st call uses 1..N, 2nd call uses N+1..2N, etc - ID0 = (ID-1)*N - -C Main loop. Each pass round this loop either results in a new -C assignment or gives a column with no assignment - - DO 100 JORD = NUM0+1,N - -C Each pass uses unique number ID1 - ID1 = ID0 + JORD -C J is unmatched column - J = FC(JORD-NUM0) - PR(J) = -1 - DO 70 K = 1,JORD -C Look for a cheap assignment - IF (ARP(J).GE.LENC(J)) GO TO 30 - IN1 = IP(J) + ARP(J) - IN2 = IP(J) + LENC(J) - 1 - DO 20 II = IN1,IN2 - I = IRN(II) - IF (IPERM(I).EQ.0) GO TO 80 - 20 CONTINUE -C No cheap assignment in row - ARP(J) = LENC(J) -C Begin looking for assignment chain starting with row J - 30 OUT(J) = LENC(J) - 1 -C Inner loop. Extends chain by one or backtracks - DO 60 KK = 1,JORD - IN1 = OUT(J) - IF (IN1.LT.0) GO TO 50 - IN2 = IP(J) + LENC(J) - 1 - IN1 = IN2 - IN1 -C Forward scan - DO 40 II = IN1,IN2 - I = IRN(II) - IF (CV(I).EQ.ID1) GO TO 40 -C Column J has not yet been accessed during this pass - J1 = J - J = IPERM(I) - CV(I) = ID1 - PR(J) = J1 - OUT(J1) = IN2 - II - 1 - GO TO 70 - 40 CONTINUE -C Backtracking step. - 50 J1 = PR(J) - IF (J1.EQ.-1) THEN -C No augmenting path exists for column J. - NFC = NFC + 1 - FC(NFC) = J - IF (NFC.GT.NUM2) THEN -C A matching of maximum size NUM1 is not possible - LAST = JORD - GO TO 101 - ENDIF - GO TO 100 - ENDIF - J = J1 - 60 CONTINUE -C End of dummy loop; this point is never reached - 70 CONTINUE -C End of dummy loop; this point is never reached - -C New assignment is made. - 80 IPERM(I) = J - ARP(J) = II - IP(J) + 1 - NUM = NUM + 1 - DO 90 K = 1,JORD - J = PR(J) - IF (J.EQ.-1) GO TO 95 - II = IP(J) + LENC(J) - OUT(J) - 2 - I = IRN(II) - IPERM(I) = J - 90 CONTINUE -C End of dummy loop; this point is never reached - - 95 IF (NUM.EQ.NUM1) THEN -C A matching of maximum size NUM1 is found - LAST = JORD - GO TO 101 - ENDIF -C - 100 CONTINUE - -C All unassigned columns have been considered - LAST = N - -C Now, a transversal is computed or is not possible. -C Complete FC before returning. - 101 DO 110 JORD = LAST+1,N - NFC = NFC + 1 - FC(NFC) = FC(JORD-NUM0) - 110 CONTINUE - -C 199 RETURN - RETURN - END - -C********************************************************************** - SUBROUTINE MC64WD(N,NE,IP,IRN,A,IPERM,NUM, - & JPERM,OUT,PR,Q,L,U,D) - IMPLICIT NONE -C -C *** Copyright (c) 1999 Council for the Central Laboratory of the -C Research Councils *** -C *** Although every effort has been made to ensure robustness and *** -C *** reliability of the subroutines in this MC64 suite, we *** -C *** disclaim any liability arising through the use or misuse of *** -C *** any of the subroutines. *** -C *** Any problems? Contact ... -C Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** -C - INTEGER N,NE,NUM - INTEGER IP(N+1),IRN(NE),IPERM(N), - & JPERM(N),OUT(N),PR(N),Q(N),L(N) - DOUBLE PRECISION A(NE),U(N),D(N) - -C N, NE, IP, IRN are described in MC64A/AD. -C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. -C A(K), K=1..NE, must be set to the value of the entry that -C corresponds to IRN(K). It is not altered. -C All values A(K) must be non-negative. -C IPERM is an INTEGER array of length N. On exit, it contains the -C weighted matching: IPERM(I) = 0 or row I is matched to column -C IPERM(I). -C NUM is an INTEGER variable. On exit, it contains the cardinality of -C the matching stored in IPERM. -C IW is an INTEGER work array of length 5N. -C DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. -C On exit, U = D(1:N) contains the dual row variable and -C V = D(N+1:2N) contains the dual column variable. If the matrix -C is structurally nonsingular (NUM = N), the following holds: -C U(I)+V(J) <= A(I,J) if IPERM(I) |= J -C U(I)+V(J) = A(I,J) if IPERM(I) = J -C U(I) = 0 if IPERM(I) = 0 -C V(J) = 0 if there is no I for which IPERM(I) = J - -C Local variables - INTEGER I,I0,II,J,JJ,JORD,Q0,QLEN,JDUM,ISP,JSP, - & K,K0,K1,K2,KK,KK1,KK2,UP,LOW - DOUBLE PRECISION CSP,DI,DMIN,DNEW,DQ0,VJ -C Local parameters - DOUBLE PRECISION RINF,ZERO - PARAMETER (ZERO=0.0D+0) -C External subroutines and/or functions -c EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD -c DOUBLE PRECISION FD05AD - EXTERNAL MC64DD,MC64ED,MC64FD, HY_DLAMCH - DOUBLE PRECISION HY_DLAMCH - - -C Set RINF to largest positive real number -c XSL RINF = FD05AD(5) - RINF = HY_DLAMCH('Overflow') - -C Initialization - NUM = 0 - DO 10 K = 1,N - U(K) = RINF - D(K) = ZERO - IPERM(K) = 0 - JPERM(K) = 0 - PR(K) = IP(K) - L(K) = 0 - 10 CONTINUE -C Initialize U(I) - DO 30 J = 1,N - DO 20 K = IP(J),IP(J+1)-1 - I = IRN(K) - IF (A(K).GT.U(I)) GO TO 20 - U(I) = A(K) - IPERM(I) = J - L(I) = K - 20 CONTINUE - 30 CONTINUE - DO 40 I = 1,N - J = IPERM(I) - IF (J.EQ.0) GO TO 40 -C Row I is not empty - IPERM(I) = 0 - IF (JPERM(J).NE.0) GO TO 40 -C Assignment of column J to row I - NUM = NUM + 1 - IPERM(I) = J - JPERM(J) = L(I) - 40 CONTINUE - IF (NUM.EQ.N) GO TO 1000 -C Scan unassigned columns; improve assignment - DO 95 J = 1,N -C JPERM(J) ne 0 iff column J is already assigned - IF (JPERM(J).NE.0) GO TO 95 - K1 = IP(J) - K2 = IP(J+1) - 1 -C Continue only if column J is not empty - IF (K1.GT.K2) GO TO 95 - VJ = RINF - DO 50 K = K1,K2 - I = IRN(K) - DI = A(K) - U(I) - IF (DI.GT.VJ) GO TO 50 - IF (DI.LT.VJ .OR. DI.EQ.RINF) GO TO 55 - IF (IPERM(I).NE.0 .OR. IPERM(I0).EQ.0) GO TO 50 - 55 VJ = DI - I0 = I - K0 = K - 50 CONTINUE - D(J) = VJ - K = K0 - I = I0 - IF (IPERM(I).EQ.0) GO TO 90 - DO 60 K = K0,K2 - I = IRN(K) - IF (A(K)-U(I).GT.VJ) GO TO 60 - JJ = IPERM(I) -C Scan remaining part of assigned column JJ - KK1 = PR(JJ) - KK2 = IP(JJ+1) - 1 - IF (KK1.GT.KK2) GO TO 60 - DO 70 KK = KK1,KK2 - II = IRN(KK) - IF (IPERM(II).GT.0) GO TO 70 - IF (A(KK)-U(II).LE.D(JJ)) GO TO 80 - 70 CONTINUE - PR(JJ) = KK2 + 1 - 60 CONTINUE - GO TO 95 - 80 JPERM(JJ) = KK - IPERM(II) = JJ - PR(JJ) = KK + 1 - 90 NUM = NUM + 1 - JPERM(J) = K - IPERM(I) = J - PR(J) = K + 1 - 95 CONTINUE - IF (NUM.EQ.N) GO TO 1000 - -C Prepare for main loop - DO 99 I = 1,N - D(I) = RINF - L(I) = 0 - 99 CONTINUE - -C Main loop ... each pass round this loop is similar to Dijkstra's -C algorithm for solving the single source shortest path problem - - DO 100 JORD = 1,N - - IF (JPERM(JORD).NE.0) GO TO 100 -C JORD is next unmatched column -C DMIN is the length of shortest path in the tree - DMIN = RINF - QLEN = 0 - LOW = N + 1 - UP = N + 1 -C CSP is the cost of the shortest augmenting path to unassigned row -C IRN(ISP). The corresponding column index is JSP. - CSP = RINF -C Build shortest path tree starting from unassigned column (root) JORD - J = JORD - PR(J) = -1 - -C Scan column J - DO 115 K = IP(J),IP(J+1)-1 - I = IRN(K) - DNEW = A(K) - U(I) - IF (DNEW.GE.CSP) GO TO 115 - IF (IPERM(I).EQ.0) THEN - CSP = DNEW - ISP = K - JSP = J - ELSE - IF (DNEW.LT.DMIN) DMIN = DNEW - D(I) = DNEW - QLEN = QLEN + 1 - Q(QLEN) = K - ENDIF - 115 CONTINUE -C Initialize heap Q and Q2 with rows held in Q(1:QLEN) - Q0 = QLEN - QLEN = 0 - DO 120 KK = 1,Q0 - K = Q(KK) - I = IRN(K) - IF (CSP.LE.D(I)) THEN - D(I) = RINF - GO TO 120 - ENDIF - IF (D(I).LE.DMIN) THEN - LOW = LOW - 1 - Q(LOW) = I - L(I) = LOW - ELSE - QLEN = QLEN + 1 - L(I) = QLEN - CALL MC64DD(I,N,Q,D,L,2) - ENDIF -C Update tree - JJ = IPERM(I) - OUT(JJ) = K - PR(JJ) = J - 120 CONTINUE - - DO 150 JDUM = 1,NUM - -C If Q2 is empty, extract rows from Q - IF (LOW.EQ.UP) THEN - IF (QLEN.EQ.0) GO TO 160 - I = Q(1) - IF (D(I).GE.CSP) GO TO 160 - DMIN = D(I) - 152 CALL MC64ED(QLEN,N,Q,D,L,2) - LOW = LOW - 1 - Q(LOW) = I - L(I) = LOW - IF (QLEN.EQ.0) GO TO 153 - I = Q(1) - IF (D(I).GT.DMIN) GO TO 153 - GO TO 152 - ENDIF -C Q0 is row whose distance D(Q0) to the root is smallest - 153 Q0 = Q(UP-1) - DQ0 = D(Q0) -C Exit loop if path to Q0 is longer than the shortest augmenting path - IF (DQ0.GE.CSP) GO TO 160 - UP = UP - 1 - -C Scan column that matches with row Q0 - J = IPERM(Q0) - VJ = DQ0 - A(JPERM(J)) + U(Q0) - DO 155 K = IP(J),IP(J+1)-1 - I = IRN(K) - IF (L(I).GE.UP) GO TO 155 -C DNEW is new cost - DNEW = VJ + A(K)-U(I) -C Do not update D(I) if DNEW ge cost of shortest path - IF (DNEW.GE.CSP) GO TO 155 - IF (IPERM(I).EQ.0) THEN -C Row I is unmatched; update shortest path info - CSP = DNEW - ISP = K - JSP = J - ELSE -C Row I is matched; do not update D(I) if DNEW is larger - DI = D(I) - IF (DI.LE.DNEW) GO TO 155 - IF (L(I).GE.LOW) GO TO 155 - D(I) = DNEW - IF (DNEW.LE.DMIN) THEN - IF (L(I).NE.0) - * CALL MC64FD(L(I),QLEN,N,Q,D,L,2) - LOW = LOW - 1 - Q(LOW) = I - L(I) = LOW - ELSE - IF (L(I).EQ.0) THEN - QLEN = QLEN + 1 - L(I) = QLEN - ENDIF - CALL MC64DD(I,N,Q,D,L,2) - ENDIF -C Update tree - JJ = IPERM(I) - OUT(JJ) = K - PR(JJ) = J - ENDIF - 155 CONTINUE - 150 CONTINUE - -C If CSP = RINF, no augmenting path is found - 160 IF (CSP.EQ.RINF) GO TO 190 -C Find augmenting path by tracing backward in PR; update IPERM,JPERM - NUM = NUM + 1 - I = IRN(ISP) - IPERM(I) = JSP - JPERM(JSP) = ISP - J = JSP - DO 170 JDUM = 1,NUM - JJ = PR(J) - IF (JJ.EQ.-1) GO TO 180 - K = OUT(J) - I = IRN(K) - IPERM(I) = JJ - JPERM(JJ) = K - J = JJ - 170 CONTINUE -C End of dummy loop; this point is never reached - -C Update U for rows in Q(UP:N) - 180 DO 185 KK = UP,N - I = Q(KK) - U(I) = U(I) + D(I) - CSP - 185 CONTINUE - 190 DO 191 KK = LOW,N - I = Q(KK) - D(I) = RINF - L(I) = 0 - 191 CONTINUE - DO 193 KK = 1,QLEN - I = Q(KK) - D(I) = RINF - L(I) = 0 - 193 CONTINUE - - 100 CONTINUE -C End of main loop - - -C Set dual column variable in D(1:N) - 1000 DO 200 J = 1,N - K = JPERM(J) - IF (K.NE.0) THEN - D(J) = A(K) - U(IRN(K)) - ELSE - D(J) = ZERO - ENDIF - IF (IPERM(J).EQ.0) U(J) = ZERO - 200 CONTINUE - - IF (NUM.EQ.N) GO TO 1100 - -C The matrix is structurally singular, complete IPERM. -C JPERM, OUT are work arrays - DO 300 J = 1,N - JPERM(J) = 0 - 300 CONTINUE - K = 0 - DO 310 I = 1,N - IF (IPERM(I).EQ.0) THEN - K = K + 1 - OUT(K) = I - ELSE - J = IPERM(I) - JPERM(J) = I - ENDIF - 310 CONTINUE - K = 0 - DO 320 J = 1,N - IF (JPERM(J).NE.0) GO TO 320 - K = K + 1 - JDUM = OUT(K) - IPERM(JDUM) = J - 320 CONTINUE - 1100 RETURN - END - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,545 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - -/* - * Global variables - */ -ExpHeader *expanders; /* Array of pointers to 4 types of memory */ -LU_stack_t stack; -int_t no_expand; - - -/* - * Prototype - */ -static int_t memory_usage(const int_t, const int_t, const int_t); -static void *expand(int_t *, MemType, int_t, int_t, - Glu_freeable_t *); - -/* - * Internal prototypes - */ -void SetupSpace (void *, int_t, LU_space_t *); - - -void -superlu_abort_and_exit_dist(char *msg) -{ - /*fprintf(stderr, msg); - fflush(stderr);*/ - printf(msg); - exit (-1); -} - -long int superlu_malloc_total = 0; - -#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ - -#define PAD_FACTOR 2 -#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ - -void *superlu_malloc_dist(size_t size) -{ - char *buf; - int iam; - - MPI_Comm_rank(MPI_COMM_WORLD, &iam); - buf = (char *) malloc(size + DWORD); - if ( !buf ) { - printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %ld\n", - iam, superlu_malloc_total*1e-6, size); - ABORT("superlu_malloc: out of memory"); - } - - ((int_t *) buf)[0] = size; -#if 0 - superlu_malloc_total += size + DWORD; -#else - superlu_malloc_total += size; -#endif - return (void *) (buf + DWORD); -} - -void superlu_free_dist(void *addr) -{ - char *p = ((char *) addr) - DWORD; - - if ( !addr ) - ABORT("superlu_free: tried to free NULL pointer"); - - if ( !p ) - ABORT("superlu_free: tried to free NULL+DWORD pointer"); - - { - int_t n = ((int_t *) p)[0]; - - if ( !n ) - ABORT("superlu_free: tried to free a freed pointer"); - *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ -#if 0 - superlu_malloc_total -= (n + DWORD); -#else - superlu_malloc_total -= n; -#endif - - if ( superlu_malloc_total < 0 ) - ABORT("superlu_malloc_total went negative"); - - /*free (addr);*/ - free (p); - } - -} - -#else /* The production mode. */ - -void *superlu_malloc_dist(size_t size) -{ - void *buf; - buf = (void *) malloc(size); - return (buf); -} - -void superlu_free_dist(void *addr) -{ - free (addr); -} - -#endif /* End debug malloc/free. */ - - - -static void -copy_mem_int(int_t howmany, void *old, void *new) -{ - register int_t i; - int_t *iold = old; - int_t *inew = new; - for (i = 0; i < howmany; i++) inew[i] = iold[i]; -} - - -static void -user_bcopy(char *src, char *dest, int_t bytes) -{ - char *s_ptr, *d_ptr; - - s_ptr = src + bytes - 1; - d_ptr = dest + bytes - 1; - for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; -} - - - -int_t *intMalloc_dist(int_t n) -{ - int_t *buf; - buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t)); - return (buf); -} - -int_t *intCalloc_dist(int_t n) -{ - int_t *buf; - register int_t i; - buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t)); - if ( buf ) - for (i = 0; i < n; ++i) buf[i] = 0; - return (buf); -} - - -void *user_malloc_dist(int_t bytes, int_t which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - -void user_free_dist(int_t bytes, int_t which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -/* - * Allocate storage for the data structures common to symbolic factorization - * routines. For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -/************************************************************************/ -int_t symbfact_SubInit -/************************************************************************/ -( - fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable - ) -{ - int_t iword; - int_t *xsup, *supno; - int_t *lsub, *xlsub; - int_t *usub, *xusub; - int_t nzlmax, nzumax; - int_t FILL = sp_ienv_dist(6); - -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter symbfact_SubInit()"); -#endif - - no_expand = 0; - iword = sizeof(int_t); - - expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) ); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact == DOFACT || fact == SamePattern ) { - /* Guess for L\U factors */ - nzlmax = FILL * annz; - nzumax = FILL/2.0 * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m,1) - + (nzlmax+nzumax)*iword + n ); - } else { - SetupSpace(work, lwork, &Glu_freeable->MemModel); - } - -#if ( PRNTlevel>=2 ) - printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", - annz, nzlmax, nzumax); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu_freeable->MemModel == SYSTEM ) { - xsup = intMalloc_dist(n+1); - supno = intMalloc_dist(n+1); - xlsub = intMalloc_dist(n+1); - xusub = intMalloc_dist(n+1); - } else { - xsup = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - supno = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - xlsub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - xusub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - } - - lsub = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable); - usub = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable); - - while ( !lsub || !usub ) { - if ( Glu_freeable->MemModel == SYSTEM ) { - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - user_free_dist((nzlmax+nzumax)*iword, HEAD); - } - nzlmax /= 2; - nzumax /= 2; - if ( nzumax < annz/2 ) { - printf("Not enough memory to perform factorization.\n"); - return (memory_usage(nzlmax, nzumax, n) + n); - } -#if ( PRNTlevel>=1 ) - printf(".. symbfact_SubInit() reduce size:" - "nzlmax %ld, nzumax %ld\n", nzlmax, nzumax); - fflush(stdout); -#endif - lsub = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable ); - usub = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable ); - } - - Glu_persist->xsup = xsup; - Glu_persist->supno = supno; - Glu_freeable->lsub = lsub; - Glu_freeable->xlsub = xlsub; - Glu_freeable->usub = usub; - Glu_freeable->xusub = xusub; - Glu_freeable->nzlmax = nzlmax; - Glu_freeable->nzumax = nzumax; - } else { - /* fact == SamePattern_SameRowPerm */ - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, 1) - + (nzlmax+nzumax)*iword + n ); - } else if ( lwork == 0 ) { - Glu_freeable->MemModel = SYSTEM; - } else { - Glu_freeable->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - expanders[USUB].mem = Glu_freeable->usub; - expanders[LSUB].mem = Glu_freeable->lsub; - expanders[USUB].size = nzumax; - expanders[LSUB].size = nzlmax; - } - - ++no_expand; - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: xsup, supno */ - CHECK_MALLOC(iam, "Exit symbfact_SubInit()"); -#endif - - return 0; - -} /* SYMBFACT_SUBINIT */ - -/* - * Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -/************************************************************************/ -int_t symbfact_SubXpand -/************************************************************************/ -( - int_t n, /* total number of columns */ - int_t jcol, /* current column */ - int_t next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int_t *maxlen, /* modified - maximum length of a data structure */ - Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#if ( DEBUGlevel>=1 ) - printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable); - - if ( !new_mem ) { - int_t nzlmax = Glu_freeable->nzlmax; - int_t nzumax = Glu_freeable->nzumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (memory_usage(nzlmax, nzumax, n) + n); - } - - if ( mem_type == LSUB ) { - Glu_freeable->lsub = (int_t *) new_mem; - Glu_freeable->nzlmax = *maxlen; - } else if ( mem_type == USUB ) { - Glu_freeable->usub = (int_t *) new_mem; - Glu_freeable->nzumax = *maxlen; - } else ABORT("Tries to expand nonexisting memory type.\n"); - - return 0; - -} /* LUSUB_XPAND */ - -/* - * Deallocate storage of the data structures common to symbolic - * factorization routines. - */ -/************************************************************************/ -int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable) -/************************************************************************/ -{ -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter symbfact_SubFree()"); -#endif - - SUPERLU_FREE(expanders); - SUPERLU_FREE(Glu_freeable->lsub); - SUPERLU_FREE(Glu_freeable->xlsub); - SUPERLU_FREE(Glu_freeable->usub); - SUPERLU_FREE(Glu_freeable->xusub); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit symbfact_SubFree()"); -#endif - return 0; -} /* SYMBFACT_SUBFREE */ - - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -/************************************************************************/ -static void *expand -/************************************************************************/ -( - int_t *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int_t len_to_copy, /* size of the memory to be copied to new store */ - int_t keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem; - int_t new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - lword = sizeof(int_t); - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( Glu_freeable->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC((size_t) new_len * lword); - /*new_mem = (void *) calloc(new_len, lword); */ - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void*) SUPERLU_MALLOC((size_t)new_len * lword); - /* new_mem = (void *) calloc(new_len, lword); */ - } - } - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = user_malloc_dist((size_t)new_len * lword, HEAD); - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu_freeable->usub = expanders[USUB].mem = - (void*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu_freeable->lsub = expanders[LSUB].mem = - (void*)((char*)expanders[LSUB].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* EXPAND */ - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -/************************************************************************/ -int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable, - mem_usage_t *mem_usage) -/************************************************************************/ -{ - register int_t iword = sizeof(int_t); - extern int_t no_expand; - - /* For the adjacency graphs of L and U. */ - /*mem_usage->for_lu = (float)( (4*n + 3) * iword + - Glu_freeable->xlsub[n]*iword );*/ - mem_usage->for_lu = (float)( (4*n + 3) * iword + - lsub_size * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Glu_freeable->xusub[n]*iword ); - - /* Working storage to support factorization */ - mem_usage->total = mem_usage->for_lu + 9*n*iword; - - mem_usage->expansions = --no_expand; - return 0; -} /* QUERYSPACE_DIST */ - -static int_t -memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n) -{ - register int_t iword = sizeof(int_t); - return (10*n*iword + (nzlmax+nzumax)*iword); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c.orig hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c.orig --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c.orig 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,546 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - -#define PRNTlevel 0 -/* - * Global variables - */ -ExpHeader *expanders; /* Array of pointers to 4 types of memory */ -LU_stack_t stack; -int_t no_expand; - - -/* - * Prototype - */ -static int_t memory_usage(const int_t, const int_t, const int_t); -static void *expand(int_t *, MemType, int_t, int_t, - Glu_freeable_t *); - -/* - * Internal prototypes - */ -void SetupSpace (void *, int_t, LU_space_t *); - - -void -superlu_abort_and_exit_dist(char *msg) -{ - /*fprintf(stderr, msg); - fflush(stderr);*/ - printf(msg); - exit (-1); -} - -long int superlu_malloc_total = 0; - -#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ - -#define PAD_FACTOR 2 -#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ - -void *superlu_malloc_dist(size_t size) -{ - char *buf; - int iam; - - MPI_Comm_rank(MPI_COMM_WORLD, &iam); - buf = (char *) malloc(size + DWORD); - if ( !buf ) { - printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %d\n", - iam, superlu_malloc_total*1e-6, size); - ABORT("superlu_malloc: out of memory"); - } - - ((int_t *) buf)[0] = size; -#if 0 - superlu_malloc_total += size + DWORD; -#else - superlu_malloc_total += size; -#endif - return (void *) (buf + DWORD); -} - -void superlu_free_dist(void *addr) -{ - char *p = ((char *) addr) - DWORD; - - if ( !addr ) - ABORT("superlu_free: tried to free NULL pointer"); - - if ( !p ) - ABORT("superlu_free: tried to free NULL+DWORD pointer"); - - { - int_t n = ((int_t *) p)[0]; - - if ( !n ) - ABORT("superlu_free: tried to free a freed pointer"); - *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ -#if 0 - superlu_malloc_total -= (n + DWORD); -#else - superlu_malloc_total -= n; -#endif - - if ( superlu_malloc_total < 0 ) - ABORT("superlu_malloc_total went negative"); - - /*free (addr);*/ - free (p); - } - -} - -#else /* The production mode. */ - -void *superlu_malloc_dist(size_t size) -{ - void *buf; - buf = (void *) malloc(size); - return (buf); -} - -void superlu_free_dist(void *addr) -{ - free (addr); -} - -#endif /* End debug malloc/free. */ - - - -static void -copy_mem_int(int_t howmany, void *old, void *new) -{ - register int_t i; - int_t *iold = old; - int_t *inew = new; - for (i = 0; i < howmany; i++) inew[i] = iold[i]; -} - - -static void -user_bcopy(char *src, char *dest, int_t bytes) -{ - char *s_ptr, *d_ptr; - - s_ptr = src + bytes - 1; - d_ptr = dest + bytes - 1; - for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; -} - - - -int_t *intMalloc_dist(int_t n) -{ - int_t *buf; - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - return (buf); -} - -int_t *intCalloc_dist(int_t n) -{ - int_t *buf; - register int_t i; - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - if ( buf ) - for (i = 0; i < n; ++i) buf[i] = 0; - return (buf); -} - - -void *user_malloc_dist(int_t bytes, int_t which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - -void user_free_dist(int_t bytes, int_t which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -/* - * Allocate storage for the data structures common to symbolic factorization - * routines. For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -/************************************************************************/ -int_t symbfact_SubInit -/************************************************************************/ -( - fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable - ) -{ - int_t iword; - int_t *xsup, *supno; - int_t *lsub, *xlsub; - int_t *usub, *xusub; - int_t nzlmax, nzumax; - int_t FILL = sp_ienv_dist(6); - -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter symbfact_SubInit()"); -#endif - - no_expand = 0; - iword = sizeof(int_t); - - expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) ); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact == DOFACT || fact == SamePattern ) { - /* Guess for L\U factors */ - nzlmax = FILL * annz; - nzumax = FILL/2.0 * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m,1) - + (nzlmax+nzumax)*iword + n ); - } else { - SetupSpace(work, lwork, &Glu_freeable->MemModel); - } - -#if ( PRNTlevel>=1 ) - printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", - annz, nzlmax, nzumax); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu_freeable->MemModel == SYSTEM ) { - xsup = intMalloc_dist(n+1); - supno = intMalloc_dist(n+1); - xlsub = intMalloc_dist(n+1); - xusub = intMalloc_dist(n+1); - } else { - xsup = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - supno = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - xlsub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - xusub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); - } - - lsub = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable); - usub = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable); - - while ( !lsub || !usub ) { - if ( Glu_freeable->MemModel == SYSTEM ) { - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - user_free_dist((nzlmax+nzumax)*iword, HEAD); - } - nzlmax /= 2; - nzumax /= 2; - if ( nzumax < annz/2 ) { - printf("Not enough memory to perform factorization.\n"); - return (memory_usage(nzlmax, nzumax, n) + n); - } -#if ( PRNTlevel>=1 ) - printf(".. symbfact_SubInit() reduce size:" - "nzlmax %ld, nzumax %ld\n", nzlmax, nzumax); - fflush(stdout); -#endif - lsub = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable ); - usub = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable ); - } - - Glu_persist->xsup = xsup; - Glu_persist->supno = supno; - Glu_freeable->lsub = lsub; - Glu_freeable->xlsub = xlsub; - Glu_freeable->usub = usub; - Glu_freeable->xusub = xusub; - Glu_freeable->nzlmax = nzlmax; - Glu_freeable->nzumax = nzumax; - } else { - /* fact == SamePattern_SameRowPerm */ - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, 1) - + (nzlmax+nzumax)*iword + n ); - } else if ( lwork == 0 ) { - Glu_freeable->MemModel = SYSTEM; - } else { - Glu_freeable->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - expanders[USUB].mem = Glu_freeable->usub; - expanders[LSUB].mem = Glu_freeable->lsub; - expanders[USUB].size = nzumax; - expanders[LSUB].size = nzlmax; - } - - ++no_expand; - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: xsup, supno */ - CHECK_MALLOC(iam, "Exit symbfact_SubInit()"); -#endif - - return 0; - -} /* SYMBFACT_SUBINIT */ - -/* - * Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -/************************************************************************/ -int_t symbfact_SubXpand -/************************************************************************/ -( - int_t n, /* total number of columns */ - int_t jcol, /* current column */ - int_t next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int_t *maxlen, /* modified - maximum length of a data structure */ - Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#if ( DEBUGlevel>=1 ) - printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable); - - if ( !new_mem ) { - int_t nzlmax = Glu_freeable->nzlmax; - int_t nzumax = Glu_freeable->nzumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (memory_usage(nzlmax, nzumax, n) + n); - } - - if ( mem_type == LSUB ) { - Glu_freeable->lsub = (int_t *) new_mem; - Glu_freeable->nzlmax = *maxlen; - } else if ( mem_type == USUB ) { - Glu_freeable->usub = (int_t *) new_mem; - Glu_freeable->nzumax = *maxlen; - } else ABORT("Tries to expand nonexisting memory type.\n"); - - return 0; - -} /* LUSUB_XPAND */ - -/* - * Deallocate storage of the data structures common to symbolic - * factorization routines. - */ -/************************************************************************/ -int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable) -/************************************************************************/ -{ -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter symbfact_SubFree()"); -#endif - - SUPERLU_FREE(expanders); - SUPERLU_FREE(Glu_freeable->lsub); - SUPERLU_FREE(Glu_freeable->xlsub); - SUPERLU_FREE(Glu_freeable->usub); - SUPERLU_FREE(Glu_freeable->xusub); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit symbfact_SubFree()"); -#endif - return 0; -} /* SYMBFACT_SUBFREE */ - - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -/************************************************************************/ -static void *expand -/************************************************************************/ -( - int_t *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int_t len_to_copy, /* size of the memory to be copied to new store */ - int_t keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem; - int_t new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - lword = sizeof(int_t); - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( Glu_freeable->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC(new_len * lword); - /*new_mem = (void *) calloc(new_len, lword); */ - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void *) SUPERLU_MALLOC(new_len * lword); - /* new_mem = (void *) calloc(new_len, lword); */ - } - } - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = user_malloc_dist(new_len * lword, HEAD); - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu_freeable->usub = expanders[USUB].mem = - (void*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu_freeable->lsub = expanders[LSUB].mem = - (void*)((char*)expanders[LSUB].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* EXPAND */ - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -/************************************************************************/ -int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable, - mem_usage_t *mem_usage) -/************************************************************************/ -{ - register int_t iword = sizeof(int_t); - extern int_t no_expand; - - /* For the adjacency graphs of L and U. */ - /*mem_usage->for_lu = (float)( (4*n + 3) * iword + - Glu_freeable->xlsub[n]*iword );*/ - mem_usage->for_lu = (float)( (4*n + 3) * iword + - lsub_size * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Glu_freeable->xusub[n]*iword ); - - /* Working storage to support factorization */ - mem_usage->total = mem_usage->for_lu + 9*n*iword; - - mem_usage->expansions = --no_expand; - return 0; -} /* QUERYSPACE_DIST */ - -static int_t -memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n) -{ - register int_t iword = sizeof(int_t); - return (10*n*iword + (nzlmax+nzumax)*iword); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.patch hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.patch --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.patch 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -118d117 -< -144c143 -< buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); ---- -> buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t)); -152c151 -< buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); ---- -> buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t)); diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mmd.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mmd.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mmd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mmd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1019 +0,0 @@ -#ifdef _CRAY -#define int short -typedef short shortint; -#elif defined (_LONGINT) -#define int long -typedef long shortint; -#else -typedef int shortint; -#endif - -/* *************************************************************** */ -/* *************************************************************** */ -/* **** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE **** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */ -/* ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION */ -/* OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */ -/* NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS */ -/* THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */ -/* EXTERNAL DEGREE. */ -/* --------------------------------------------- */ -/* CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */ -/* DESTROYED. */ -/* --------------------------------------------- */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */ -/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ -/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */ -/* (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */ -/* NODES. */ - -/* OUTPUT PARAMETERS - */ -/* PERM - THE MINIMUM DEGREE ORDERING. */ -/* INVP - THE INVERSE OF PERM. */ -/* NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */ -/* SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */ - -/* WORKING PARAMETERS - */ -/* DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. */ -/* INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. */ -/* PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */ -/* QSIZE - VECTOR FOR SIZE OF SUPERNODES. */ -/* LLIST - VECTOR FOR TEMPORARY LINKED LISTS. */ -/* MARKER - A TEMPORARY MARKER VECTOR. */ - -/* PROGRAM SUBROUTINES - */ -/* MMDELM, MMDINT, MMDNUM, MMDUPD. */ - -/* *************************************************************** */ - -/* Subroutine */ int genmmd_dist_(int *neqns, int *xadj, shortint *adjncy, - shortint *invp, shortint *perm, int *delta, shortint *dhead, - shortint *qsize, shortint *llist, shortint *marker, int *maxint, - int *nofsub) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int mdeg, ehead, i, mdlmt, mdnode; - extern /* Subroutine */ int mmdelm_dist(int *, int *, shortint *, - shortint *, shortint *, shortint *, shortint *, shortint *, - shortint *, int *, int *), mmdupd_dist(int *, int *, - int *, shortint *, int *, int *, shortint *, shortint - *, shortint *, shortint *, shortint *, shortint *, int *, - int *), mmdint_dist(int *, int *, shortint *, shortint *, - shortint *, shortint *, shortint *, shortint *, shortint *), - mmdnum_dist(int *, shortint *, shortint *, shortint *); - static int nextmd, tag, num; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dhead; - --perm; - --invp; - --adjncy; - --xadj; - - /* Function Body */ - if (*neqns <= 0) { - return 0; - } - -/* ------------------------------------------------ */ -/* INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */ -/* ------------------------------------------------ */ - *nofsub = 0; - mmdint_dist(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], - &qsize[1], &llist[1], &marker[1]); - -/* ---------------------------------------------- */ -/* NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */ -/* ---------------------------------------------- */ - num = 1; - -/* ----------------------------- */ -/* ELIMINATE ALL ISOLATED NODES. */ -/* ----------------------------- */ - nextmd = dhead[1]; -L100: - if (nextmd <= 0) { - goto L200; - } - mdnode = nextmd; - nextmd = invp[mdnode]; - marker[mdnode] = *maxint; - invp[mdnode] = -num; - ++num; - goto L100; - -L200: -/* ---------------------------------------- */ -/* SEARCH FOR NODE OF THE MINIMUM DEGREE. */ -/* MDEG IS THE CURRENT MINIMUM DEGREE; */ -/* TAG IS USED TO FACILITATE MARKING NODES. */ -/* ---------------------------------------- */ - if (num > *neqns) { - goto L1000; - } - tag = 1; - dhead[1] = 0; - mdeg = 2; -L300: - if (dhead[mdeg] > 0) { - goto L400; - } - ++mdeg; - goto L300; -L400: -/* ------------------------------------------------- */ -/* USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */ -/* WHEN A DEGREE UPDATE IS TO BE PERFORMED. */ -/* ------------------------------------------------- */ - mdlmt = mdeg + *delta; - ehead = 0; - -L500: - mdnode = dhead[mdeg]; - if (mdnode > 0) { - goto L600; - } - ++mdeg; - if (mdeg > mdlmt) { - goto L900; - } - goto L500; -L600: -/* ---------------------------------------- */ -/* REMOVE MDNODE FROM THE DEGREE STRUCTURE. */ -/* ---------------------------------------- */ - nextmd = invp[mdnode]; - dhead[mdeg] = nextmd; - if (nextmd > 0) { - perm[nextmd] = -mdeg; - } - invp[mdnode] = -num; - *nofsub = *nofsub + mdeg + qsize[mdnode] - 2; - if (num + qsize[mdnode] > *neqns) { - goto L1000; - } -/* ---------------------------------------------- */ -/* ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */ -/* TRANSFORMATION. RESET TAG VALUE IF NECESSARY. */ -/* ---------------------------------------------- */ - ++tag; - if (tag < *maxint) { - goto L800; - } - tag = 1; - i__1 = *neqns; - for (i = 1; i <= i__1; ++i) { - if (marker[i] < *maxint) { - marker[i] = 0; - } -/* L700: */ - } -L800: - mmdelm_dist(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], - &qsize[1], &llist[1], &marker[1], maxint, &tag); - num += qsize[mdnode]; - llist[mdnode] = ehead; - ehead = mdnode; - if (*delta >= 0) { - goto L500; - } -L900: -/* ------------------------------------------- */ -/* UPDATE DEGREES OF THE NODES INVOLVED IN THE */ -/* MINIMUM DEGREE NODES ELIMINATION. */ -/* ------------------------------------------- */ - if (num > *neqns) { - goto L1000; - } - mmdupd_dist(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], - &invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, - &tag); - goto L300; - -L1000: - mmdnum_dist(neqns, &perm[1], &invp[1], &qsize[1]); - return 0; - -} /* genmmd_dist_ */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* *** MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION *** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */ -/* MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */ -/* ALGORITHM. */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ - -/* OUTPUT PARAMETERS - */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). */ -/* LLIST - LINKED LIST. */ -/* MARKER - MARKER VECTOR. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdint_dist(int *neqns, int *xadj, shortint *adjncy, - shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, - shortint *llist, shortint *marker) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int ndeg, node, fnode; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - dhead[node] = 0; - qsize[node] = 1; - marker[node] = 0; - llist[node] = 0; -/* L100: */ - } -/* ------------------------------------------ */ -/* INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */ -/* ------------------------------------------ */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - ndeg = xadj[node + 1] - xadj[node] + 1; - fnode = dhead[ndeg]; - dforw[node] = fnode; - dhead[ndeg] = node; - if (fnode > 0) { - dbakw[fnode] = node; - } - dbakw[node] = -ndeg; -/* L200: */ - } - return 0; - -} /* mmdint_dist */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */ -/* MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */ -/* IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO */ -/* TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */ -/* ELIMINATION GRAPH. */ - -/* INPUT PARAMETERS - */ -/* MDNODE - NODE OF MINIMUM DEGREE. */ -/* MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */ -/* INT. */ -/* TAG - TAG VALUE. */ - -/* UPDATED PARAMETERS - */ -/* (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE. */ -/* MARKER - MARKER VECTOR. */ -/* LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdelm_dist(int *mdnode, int *xadj, shortint *adjncy, - shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, - shortint *llist, shortint *marker, int *maxint, int *tag) -{ - /* System generated locals */ - int i__1, i__2; - - /* Local variables */ - static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, - istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - -/* ----------------------------------------------- */ -/* FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */ -/* ----------------------------------------------- */ - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - marker[*mdnode] = *tag; - istrt = xadj[*mdnode]; - istop = xadj[*mdnode + 1] - 1; -/* ------------------------------------------------------- */ -/* ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */ -/* NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */ -/* FOR THE NEXT REACHABLE NODE. */ -/* ------------------------------------------------------- */ - elmnt = 0; - rloc = istrt; - rlmt = istop; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - nabor = adjncy[i]; - if (nabor == 0) { - goto L300; - } - if (marker[nabor] >= *tag) { - goto L200; - } - marker[nabor] = *tag; - if (dforw[nabor] < 0) { - goto L100; - } - adjncy[rloc] = nabor; - ++rloc; - goto L200; -L100: - llist[nabor] = elmnt; - elmnt = nabor; -L200: - ; - } -L300: -/* ----------------------------------------------------- */ -/* MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */ -/* ----------------------------------------------------- */ - if (elmnt <= 0) { - goto L1000; - } - adjncy[rlmt] = -elmnt; - link = elmnt; -L400: - jstrt = xadj[link]; - jstop = xadj[link + 1] - 1; - i__1 = jstop; - for (j = jstrt; j <= i__1; ++j) { - node = adjncy[j]; - link = -node; - if (node < 0) { - goto L400; - } else if (node == 0) { - goto L900; - } else { - goto L500; - } -L500: - if (marker[node] >= *tag || dforw[node] < 0) { - goto L800; - } - marker[node] = *tag; -/* --------------------------------- */ -/* USE STORAGE FROM ELIMINATED NODES */ -/* IF NECESSARY. */ -/* --------------------------------- */ -L600: - if (rloc < rlmt) { - goto L700; - } - link = -adjncy[rlmt]; - rloc = xadj[link]; - rlmt = xadj[link + 1] - 1; - goto L600; -L700: - adjncy[rloc] = node; - ++rloc; -L800: - ; - } -L900: - elmnt = llist[elmnt]; - goto L300; -L1000: - if (rloc <= rlmt) { - adjncy[rloc] = 0; - } -/* -------------------------------------------------------- */ -/* FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */ -/* -------------------------------------------------------- */ - link = *mdnode; -L1100: - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - rnode = adjncy[i]; - link = -rnode; - if (rnode < 0) { - goto L1100; - } else if (rnode == 0) { - goto L1800; - } else { - goto L1200; - } -L1200: -/* -------------------------------------------- */ -/* IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */ -/* -------------------------------------------- */ - pvnode = dbakw[rnode]; - if (pvnode == 0 || pvnode == -(*maxint)) { - goto L1300; - } -/* ------------------------------------- */ -/* THEN REMOVE RNODE FROM THE STRUCTURE. */ -/* ------------------------------------- */ - nxnode = dforw[rnode]; - if (nxnode > 0) { - dbakw[nxnode] = pvnode; - } - if (pvnode > 0) { - dforw[pvnode] = nxnode; - } - npv = -pvnode; - if (pvnode < 0) { - dhead[npv] = nxnode; - } -L1300: -/* ---------------------------------------- */ -/* PURGE INACTIVE QUOTIENT NABORS OF RNODE. */ -/* ---------------------------------------- */ - jstrt = xadj[rnode]; - jstop = xadj[rnode + 1] - 1; - xqnbr = jstrt; - i__2 = jstop; - for (j = jstrt; j <= i__2; ++j) { - nabor = adjncy[j]; - if (nabor == 0) { - goto L1500; - } - if (marker[nabor] >= *tag) { - goto L1400; - } - adjncy[xqnbr] = nabor; - ++xqnbr; -L1400: - ; - } -L1500: -/* ---------------------------------------- */ -/* IF NO ACTIVE NABOR AFTER THE PURGING ... */ -/* ---------------------------------------- */ - nqnbrs = xqnbr - jstrt; - if (nqnbrs > 0) { - goto L1600; - } -/* ----------------------------- */ -/* THEN MERGE RNODE WITH MDNODE. */ -/* ----------------------------- */ - qsize[*mdnode] += qsize[rnode]; - qsize[rnode] = 0; - marker[rnode] = *maxint; - dforw[rnode] = -(*mdnode); - dbakw[rnode] = -(*maxint); - goto L1700; -L1600: -/* -------------------------------------- */ -/* ELSE FLAG RNODE FOR DEGREE UPDATE, AND */ -/* ADD MDNODE AS A NABOR OF RNODE. */ -/* -------------------------------------- */ - dforw[rnode] = nqnbrs + 1; - dbakw[rnode] = 0; - adjncy[xqnbr] = *mdnode; - ++xqnbr; - if (xqnbr <= jstop) { - adjncy[xqnbr] = 0; - } - -L1700: - ; - } -L1800: - return 0; - -} /* mmdelm_dist */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ***** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */ -/* AFTER A MULTIPLE ELIMINATION STEP. */ - -/* INPUT PARAMETERS - */ -/* EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED */ -/* NODES (I.E., NEWLY FORMED ELEMENTS). */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ -/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ -/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */ -/* INTEGER. */ - -/* UPDATED PARAMETERS - */ -/* MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE. */ -/* LLIST - WORKING LINKED LIST. */ -/* MARKER - MARKER VECTOR FOR DEGREE UPDATE. */ -/* TAG - TAG VALUE. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdupd_dist(int *ehead, int *neqns, int *xadj, - shortint *adjncy, int *delta, int *mdeg, shortint *dhead, - shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, - shortint *marker, int *maxint, int *tag) -{ - /* System generated locals */ - int i__1, i__2; - - /* Local variables */ - static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, - istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - mdeg0 = *mdeg + *delta; - elmnt = *ehead; -L100: -/* ------------------------------------------------------- */ -/* FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */ -/* (RESET TAG VALUE IF NECESSARY.) */ -/* ------------------------------------------------------- */ - if (elmnt <= 0) { - return 0; - } - mtag = *tag + mdeg0; - if (mtag < *maxint) { - goto L300; - } - *tag = 1; - i__1 = *neqns; - for (i = 1; i <= i__1; ++i) { - if (marker[i] < *maxint) { - marker[i] = 0; - } -/* L200: */ - } - mtag = *tag + mdeg0; -L300: -/* --------------------------------------------- */ -/* CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */ -/* WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */ -/* ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */ -/* THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, */ -/* NUMBER OF NODES IN THIS ELEMENT. */ -/* --------------------------------------------- */ - q2head = 0; - qxhead = 0; - deg0 = 0; - link = elmnt; -L400: - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - enode = adjncy[i]; - link = -enode; - if (enode < 0) { - goto L400; - } else if (enode == 0) { - goto L800; - } else { - goto L500; - } - -L500: - if (qsize[enode] == 0) { - goto L700; - } - deg0 += qsize[enode]; - marker[enode] = mtag; -/* ---------------------------------- */ -/* IF ENODE REQUIRES A DEGREE UPDATE, */ -/* THEN DO THE FOLLOWING. */ -/* ---------------------------------- */ - if (dbakw[enode] != 0) { - goto L700; - } -/* --------------------------------------- -*/ -/* PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. -*/ -/* --------------------------------------- -*/ - if (dforw[enode] == 2) { - goto L600; - } - llist[enode] = qxhead; - qxhead = enode; - goto L700; -L600: - llist[enode] = q2head; - q2head = enode; -L700: - ; - } -L800: -/* -------------------------------------------- */ -/* FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */ -/* -------------------------------------------- */ - enode = q2head; - iq2 = 1; -L900: - if (enode <= 0) { - goto L1500; - } - if (dbakw[enode] != 0) { - goto L2200; - } - ++(*tag); - deg = deg0; -/* ------------------------------------------ */ -/* IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */ -/* ------------------------------------------ */ - istrt = xadj[enode]; - nabor = adjncy[istrt]; - if (nabor == elmnt) { - nabor = adjncy[istrt + 1]; - } -/* ------------------------------------------------ */ -/* IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */ -/* ------------------------------------------------ */ - link = nabor; - if (dforw[nabor] < 0) { - goto L1000; - } - deg += qsize[nabor]; - goto L2100; -L1000: -/* -------------------------------------------- */ -/* OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */ -/* DO THE FOLLOWING. */ -/* -------------------------------------------- */ - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - node = adjncy[i]; - link = -node; - if (node == enode) { - goto L1400; - } - if (node < 0) { - goto L1000; - } else if (node == 0) { - goto L2100; - } else { - goto L1100; - } - -L1100: - if (qsize[node] == 0) { - goto L1400; - } - if (marker[node] >= *tag) { - goto L1200; - } -/* ----------------------------------- --- */ -/* CASE WHEN NODE IS NOT YET CONSIDERED -. */ -/* ----------------------------------- --- */ - marker[node] = *tag; - deg += qsize[node]; - goto L1400; -L1200: -/* ---------------------------------------- - */ -/* CASE WHEN NODE IS INDISTINGUISHABLE FROM - */ -/* ENODE. MERGE THEM INTO A NEW SUPERNODE. - */ -/* ---------------------------------------- - */ - if (dbakw[node] != 0) { - goto L1400; - } - if (dforw[node] != 2) { - goto L1300; - } - qsize[enode] += qsize[node]; - qsize[node] = 0; - marker[node] = *maxint; - dforw[node] = -enode; - dbakw[node] = -(*maxint); - goto L1400; -L1300: -/* -------------------------------------- -*/ -/* CASE WHEN NODE IS OUTMATCHED BY ENODE. -*/ -/* -------------------------------------- -*/ - if (dbakw[node] == 0) { - dbakw[node] = -(*maxint); - } -L1400: - ; - } - goto L2100; -L1500: -/* ------------------------------------------------ */ -/* FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */ -/* ------------------------------------------------ */ - enode = qxhead; - iq2 = 0; -L1600: - if (enode <= 0) { - goto L2300; - } - if (dbakw[enode] != 0) { - goto L2200; - } - ++(*tag); - deg = deg0; -/* --------------------------------- */ -/* FOR EACH UNMARKED NABOR OF ENODE, */ -/* DO THE FOLLOWING. */ -/* --------------------------------- */ - istrt = xadj[enode]; - istop = xadj[enode + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - nabor = adjncy[i]; - if (nabor == 0) { - goto L2100; - } - if (marker[nabor] >= *tag) { - goto L2000; - } - marker[nabor] = *tag; - link = nabor; -/* ------------------------------ */ -/* IF UNELIMINATED, INCLUDE IT IN */ -/* DEG COUNT. */ -/* ------------------------------ */ - if (dforw[nabor] < 0) { - goto L1700; - } - deg += qsize[nabor]; - goto L2000; -L1700: -/* ------------------------------- -*/ -/* IF ELIMINATED, INCLUDE UNMARKED -*/ -/* NODES IN THIS ELEMENT INTO THE -*/ -/* DEGREE COUNT. */ -/* ------------------------------- -*/ - jstrt = xadj[link]; - jstop = xadj[link + 1] - 1; - i__2 = jstop; - for (j = jstrt; j <= i__2; ++j) { - node = adjncy[j]; - link = -node; - if (node < 0) { - goto L1700; - } else if (node == 0) { - goto L2000; - } else { - goto L1800; - } - -L1800: - if (marker[node] >= *tag) { - goto L1900; - } - marker[node] = *tag; - deg += qsize[node]; -L1900: - ; - } -L2000: - ; - } -L2100: -/* ------------------------------------------- */ -/* UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */ -/* STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */ -/* ------------------------------------------- */ - deg = deg - qsize[enode] + 1; - fnode = dhead[deg]; - dforw[enode] = fnode; - dbakw[enode] = -deg; - if (fnode > 0) { - dbakw[fnode] = enode; - } - dhead[deg] = enode; - if (deg < *mdeg) { - *mdeg = deg; - } -L2200: -/* ---------------------------------- */ -/* GET NEXT ENODE IN CURRENT ELEMENT. */ -/* ---------------------------------- */ - enode = llist[enode]; - if (iq2 == 1) { - goto L900; - } - goto L1600; -L2300: -/* ----------------------------- */ -/* GET NEXT ELEMENT IN THE LIST. */ -/* ----------------------------- */ - *tag = mtag; - elmnt = llist[elmnt]; - goto L100; - -} /* mmdupd_dist */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ***** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */ -/* PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */ -/* VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */ -/* MINIMUM DEGREE ORDERING ALGORITHM. */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* QSIZE - SIZE OF SUPERNODES AT ELIMINATION. */ - -/* UPDATED PARAMETERS - */ -/* INVP - INVERSE PERMUTATION VECTOR. ON INPUT, */ -/* IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */ -/* INTO THE NODE -INVP(NODE); OTHERWISE, */ -/* -INVP(NODE) IS ITS INVERSE LABELLING. */ - -/* OUTPUT PARAMETERS - */ -/* PERM - THE PERMUTATION VECTOR. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdnum_dist(int *neqns, shortint *perm, shortint *invp, - shortint *qsize) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int node, root, nextf, father, nqsize, num; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --qsize; - --invp; - --perm; - - /* Function Body */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - nqsize = qsize[node]; - if (nqsize <= 0) { - perm[node] = invp[node]; - } - if (nqsize > 0) { - perm[node] = -invp[node]; - } -/* L100: */ - } -/* ------------------------------------------------------ */ -/* FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */ -/* ------------------------------------------------------ */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - if (perm[node] > 0) { - goto L500; - } -/* ----------------------------------------- */ -/* TRACE THE MERGED TREE UNTIL ONE WHICH HAS */ -/* NOT BEEN MERGED, CALL IT ROOT. */ -/* ----------------------------------------- */ - father = node; -L200: - if (perm[father] > 0) { - goto L300; - } - father = -perm[father]; - goto L200; -L300: -/* ----------------------- */ -/* NUMBER NODE AFTER ROOT. */ -/* ----------------------- */ - root = father; - num = perm[root] + 1; - invp[node] = -num; - perm[root] = num; -/* ------------------------ */ -/* SHORTEN THE MERGED TREE. */ -/* ------------------------ */ - father = node; -L400: - nextf = -perm[father]; - if (nextf <= 0) { - goto L500; - } - perm[father] = -root; - father = nextf; - goto L400; -L500: - ; - } -/* ---------------------- */ -/* READY TO COMPUTE PERM. */ -/* ---------------------- */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - num = -invp[node]; - invp[node] = num; - perm[num] = node; -/* L600: */ - } - return 0; - -} /* mmdnum_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,2583 +0,0 @@ -/* ========================================================================== */ -/* === colamd - a sparse matrix column ordering algorithm =================== */ -/* ========================================================================== */ - -/* - colamd: An approximate minimum degree column ordering algorithm. - - Purpose: - - Colamd computes a permutation Q such that the Cholesky factorization of - (AQ)'(AQ) has less fill-in and requires fewer floating point operations - than A'A. This also provides a good ordering for sparse partial - pivoting methods, P(AQ) = LU, where Q is computed prior to numerical - factorization, and P is computed during numerical factorization via - conventional partial pivoting with row interchanges. Colamd is the - column ordering method used in SuperLU, part of the ScaLAPACK library. - It is also available as user-contributed software for Matlab 5.2, - available from MathWorks, Inc. (http://www.mathworks.com). This - routine can be used in place of COLMMD in Matlab. By default, the \ - and / operators in Matlab perform a column ordering (using COLMMD) - prior to LU factorization using sparse partial pivoting, in the - built-in Matlab LU(A) routine. - - Authors: - - The authors of the code itself are Stefan I. Larimore and Timothy A. - Davis (davis@cise.ufl.edu), University of Florida. The algorithm was - developed in collaboration with John Gilbert, Xerox PARC, and Esmond - Ng, Oak Ridge National Laboratory. - - Date: - - August 3, 1998. Version 1.0. - - Acknowledgements: - - This work was supported by the National Science Foundation, under - grants DMS-9504974 and DMS-9803599. - - Notice: - - Copyright (c) 1998 by the University of Florida. All Rights Reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - User documentation of any code that uses this code must cite the - Authors, the Copyright, and "Used by permission." If this code is - accessible from within Matlab, then typing "help colamd" or "colamd" - (with no arguments) must cite the Authors. Permission to modify the - code and to distribute modified code is granted, provided the above - notices are retained, and a notice that the code was modified is - included with the above copyright notice. You must also retain the - Availability information below, of the original version. - - This software is provided free of charge. - - Availability: - - This file is located at - - http://www.cise.ufl.edu/~davis/colamd/colamd.c - - The colamd.h file is required, located in the same directory. - The colamdmex.c file provides a Matlab interface for colamd. - The symamdmex.c file provides a Matlab interface for symamd, which is - a symmetric ordering based on this code, colamd.c. All codes are - purely ANSI C compliant (they use no Unix-specific routines, include - files, etc.). -*/ - -/* ========================================================================== */ -/* === Description of user-callable routines ================================ */ -/* ========================================================================== */ - -/* - Each user-callable routine (declared as PUBLIC) is briefly described below. - Refer to the comments preceding each routine for more details. - - ---------------------------------------------------------------------------- - colamd_recommended: - ---------------------------------------------------------------------------- - - Usage: - - Alen = colamd_recommended (nnz, n_row, n_col) ; - - Purpose: - - Returns recommended value of Alen for use by colamd. Returns -1 - if any input argument is negative. - - Arguments: - - int nnz ; Number of nonzeros in the matrix A. This must - be the same value as p [n_col] in the call to - colamd - otherwise you will get a wrong value - of the recommended memory to use. - int n_row ; Number of rows in the matrix A. - int n_col ; Number of columns in the matrix A. - - ---------------------------------------------------------------------------- - colamd_set_defaults: - ---------------------------------------------------------------------------- - - Usage: - - colamd_set_defaults (knobs) ; - - Purpose: - - Sets the default parameters. - - Arguments: - - double knobs [COLAMD_KNOBS] ; Output only. - - Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries - are removed prior to ordering. Columns with more than - (knobs [COLAMD_DENSE_COL] * n_row) entries are removed - prior to ordering, and placed last in the output column - ordering. Default values of these two knobs are both 0.5. - Currently, only knobs [0] and knobs [1] are used, but future - versions may use more knobs. If so, they will be properly set - to their defaults by the future version of colamd_set_defaults, - so that the code that calls colamd will not need to change, - assuming that you either use colamd_set_defaults, or pass a - (double *) NULL pointer as the knobs array to colamd. - - ---------------------------------------------------------------------------- - colamd: - ---------------------------------------------------------------------------- - - Usage: - - colamd (n_row, n_col, Alen, A, p, knobs) ; - - Purpose: - - Computes a column ordering (Q) of A such that P(AQ)=LU or - (AQ)'AQ=LL' have less fill-in and require fewer floating point - operations than factorizing the unpermuted matrix A or A'A, - respectively. - - Arguments: - - int n_row ; - - Number of rows in the matrix A. - Restriction: n_row >= 0. - Colamd returns FALSE if n_row is negative. - - int n_col ; - - Number of columns in the matrix A. - Restriction: n_col >= 0. - Colamd returns FALSE if n_col is negative. - - int Alen ; - - Restriction (see note): - Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS - Colamd returns FALSE if these conditions are not met. - - Note: this restriction makes an modest assumption regarding - the size of the two typedef'd structures, below. We do, - however, guarantee that - Alen >= colamd_recommended (nnz, n_row, n_col) - will be sufficient. - - int A [Alen] ; Input argument, stats on output. - - A is an integer array of size Alen. Alen must be at least as - large as the bare minimum value given above, but this is very - low, and can result in excessive run time. For best - performance, we recommend that Alen be greater than or equal to - colamd_recommended (nnz, n_row, n_col), which adds - nnz/5 to the bare minimum value given above. - - On input, the row indices of the entries in column c of the - matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices - in a given column c need not be in ascending order, and - duplicate row indices may be be present. However, colamd will - work a little faster if both of these conditions are met - (Colamd puts the matrix into this format, if it finds that the - the conditions are not met). - - The matrix is 0-based. That is, rows are in the range 0 to - n_row-1, and columns are in the range 0 to n_col-1. Colamd - returns FALSE if any row index is out of range. - - The contents of A are modified during ordering, and are thus - undefined on output with the exception of a few statistics - about the ordering (A [0..COLAMD_STATS-1]): - A [0]: number of dense or empty rows ignored. - A [1]: number of dense or empty columns ignored (and ordered - last in the output permutation p) - A [2]: number of garbage collections performed. - A [3]: 0, if all row indices in each column were in sorted - order, and no duplicates were present. - 1, otherwise (in which case colamd had to do more work) - Note that a row can become "empty" if it contains only - "dense" and/or "empty" columns, and similarly a column can - become "empty" if it only contains "dense" and/or "empty" rows. - Future versions may return more statistics in A, but the usage - of these 4 entries in A will remain unchanged. - - int p [n_col+1] ; Both input and output argument. - - p is an integer array of size n_col+1. On input, it holds the - "pointers" for the column form of the matrix A. Column c of - the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first - entry, p [0], must be zero, and p [c] <= p [c+1] must hold - for all c in the range 0 to n_col-1. The value p [n_col] is - thus the total number of entries in the pattern of the matrix A. - Colamd returns FALSE if these conditions are not met. - - On output, if colamd returns TRUE, the array p holds the column - permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is - the first column index in the new ordering, and p [n_col-1] is - the last. That is, p [k] = j means that column j of A is the - kth pivot column, in AQ, where k is in the range 0 to n_col-1 - (p [0] = j means that column j of A is the first column in AQ). - - If colamd returns FALSE, then no permutation is returned, and - p is undefined on output. - - double knobs [COLAMD_KNOBS] ; Input only. - - See colamd_set_defaults for a description. If the knobs array - is not present (that is, if a (double *) NULL pointer is passed - in its place), then the default values of the parameters are - used instead. - -*/ - - -/* ========================================================================== */ -/* === Include files ======================================================== */ -/* ========================================================================== */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -/* colamd.h: knob array size, stats output size, and global prototypes */ -#include "colamd.h" - -/* ========================================================================== */ -/* === Scaffolding code definitions ======================================== */ -/* ========================================================================== */ - -/* Ensure that debugging is turned off: */ -#ifndef NDEBUG -#define NDEBUG -#endif - -/* assert.h: the assert macro (no debugging if NDEBUG is defined) */ -#include - -/* - Our "scaffolding code" philosophy: In our opinion, well-written library - code should keep its "debugging" code, and just normally have it turned off - by the compiler so as not to interfere with performance. This serves - several purposes: - - (1) assertions act as comments to the reader, telling you what the code - expects at that point. All assertions will always be true (unless - there really is a bug, of course). - - (2) leaving in the scaffolding code assists anyone who would like to modify - the code, or understand the algorithm (by reading the debugging output, - one can get a glimpse into what the code is doing). - - (3) (gasp!) for actually finding bugs. This code has been heavily tested - and "should" be fully functional and bug-free ... but you never know... - - To enable debugging, comment out the "#define NDEBUG" above. The code will - become outrageously slow when debugging is enabled. To control the level of - debugging output, set an environment variable D to 0 (little), 1 (some), - 2, 3, or 4 (lots). -*/ - -/* ========================================================================== */ -/* === Row and Column structures ============================================ */ -/* ========================================================================== */ - -typedef struct ColInfo_struct -{ - int start ; /* index for A of first row in this column, or DEAD */ - /* if column is dead */ - int length ; /* number of rows in this column */ - union - { - int thickness ; /* number of original columns represented by this */ - /* col, if the column is alive */ - int parent ; /* parent in parent tree super-column structure, if */ - /* the column is dead */ - } shared1 ; - union - { - int score ; /* the score used to maintain heap, if col is alive */ - int order ; /* pivot ordering of this column, if col is dead */ - } shared2 ; - union - { - int headhash ; /* head of a hash bucket, if col is at the head of */ - /* a degree list */ - int hash ; /* hash value, if col is not in a degree list */ - int prev ; /* previous column in degree list, if col is in a */ - /* degree list (but not at the head of a degree list) */ - } shared3 ; - union - { - int degree_next ; /* next column, if col is in a degree list */ - int hash_next ; /* next column, if col is in a hash list */ - } shared4 ; - -} ColInfo ; - -typedef struct RowInfo_struct -{ - int start ; /* index for A of first col in this row */ - int length ; /* number of principal columns in this row */ - union - { - int degree ; /* number of principal & non-principal columns in row */ - int p ; /* used as a row pointer in init_rows_cols () */ - } shared1 ; - union - { - int mark ; /* for computing set differences and marking dead rows*/ - int first_column ;/* first column in row (used in garbage collection) */ - } shared2 ; - -} RowInfo ; - -/* ========================================================================== */ -/* === Definitions ========================================================== */ -/* ========================================================================== */ - -#define MAX(a,b) (((a) > (b)) ? (a) : (b)) -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) - -#define ONES_COMPLEMENT(r) (-(r)-1) - -#define TRUE (1) -#define FALSE (0) -#define EMPTY (-1) - -/* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) - -/* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } - -/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */ -#define PUBLIC -#define PRIVATE static - -/* ========================================================================== */ -/* === Prototypes of PRIVATE routines ======================================= */ -/* ========================================================================== */ - -PRIVATE int init_rows_cols -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int p [] -) ; - -PRIVATE void init_scoring -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int head [], - double knobs [COLAMD_KNOBS], - int *p_n_row2, - int *p_n_col2, - int *p_max_deg -) ; - -PRIVATE int find_ordering -( - int n_row, - int n_col, - int Alen, - RowInfo Row [], - ColInfo Col [], - int A [], - int head [], - int n_col2, - int max_deg, - int pfree -) ; - -PRIVATE void order_children -( - int n_col, - ColInfo Col [], - int p [] -) ; - -PRIVATE void detect_super_cols -( -#ifndef NDEBUG - int n_col, - RowInfo Row [], -#endif - ColInfo Col [], - int A [], - int head [], - int row_start, - int row_length -) ; - -PRIVATE int garbage_collection -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int *pfree -) ; - -PRIVATE int clear_mark -( - int n_row, - RowInfo Row [] -) ; - -/* ========================================================================== */ -/* === Debugging definitions ================================================ */ -/* ========================================================================== */ - -#ifndef NDEBUG - -/* === With debugging ======================================================= */ - -/* stdlib.h: for getenv and atoi, to get debugging level from environment */ -#include - -/* stdio.h: for printf (no printing if debugging is turned off) */ -#include - -PRIVATE void debug_deg_lists -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int head [], - int min_score, - int should, - int max_deg -) ; - -PRIVATE void debug_mark -( - int n_row, - RowInfo Row [], - int tag_mark, - int max_mark -) ; - -PRIVATE void debug_matrix -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [] -) ; - -PRIVATE void debug_structures -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int n_col2 -) ; - -/* the following is the *ONLY* global variable in this file, and is only */ -/* present when debugging */ - -PRIVATE int debug_colamd ; /* debug print level */ - -#define DEBUG0(params) { (void) printf params ; } -#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; } -#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; } -#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; } -#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; } - -#else - -/* === No debugging ========================================================= */ - -#define DEBUG0(params) ; -#define DEBUG1(params) ; -#define DEBUG2(params) ; -#define DEBUG3(params) ; -#define DEBUG4(params) ; - -#endif - -/* ========================================================================== */ - - -/* ========================================================================== */ -/* === USER-CALLABLE ROUTINES: ============================================== */ -/* ========================================================================== */ - - -/* ========================================================================== */ -/* === colamd_recommended =================================================== */ -/* ========================================================================== */ - -/* - The colamd_recommended routine returns the suggested size for Alen. This - value has been determined to provide good balance between the number of - garbage collections and the memory requirements for colamd. -*/ - -PUBLIC int colamd_recommended /* returns recommended value of Alen. */ -( - /* === Parameters ======================================================= */ - - int nnz, /* number of nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) -{ - /* === Local variables ================================================== */ - - int minimum ; /* bare minimum requirements */ - int recommended ; /* recommended value of Alen */ - - if (nnz < 0 || n_row < 0 || n_col < 0) - { - /* return -1 if any input argument is corrupted */ - DEBUG0 (("colamd_recommended error!")) ; - DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ; - return (-1) ; - } - - minimum = - 2 * (nnz) /* for A */ - + (((n_col) + 1) * sizeof (ColInfo) / sizeof (int)) /* for Col */ - + (((n_row) + 1) * sizeof (RowInfo) / sizeof (int)) /* for Row */ - + n_col /* minimum elbow room to guarrantee success */ - + COLAMD_STATS ; /* for output statistics */ - - /* recommended is equal to the minumum plus enough memory to keep the */ - /* number garbage collections low */ - recommended = minimum + nnz/5 ; - - return (recommended) ; -} - - -/* ========================================================================== */ -/* === colamd_set_defaults ================================================== */ -/* ========================================================================== */ - -/* - The colamd_set_defaults routine sets the default values of the user- - controllable parameters for colamd: - - knobs [0] rows with knobs[0]*n_col entries or more are removed - prior to ordering. - - knobs [1] columns with knobs[1]*n_row entries or more are removed - prior to ordering, and placed last in the column - permutation. - - knobs [2..19] unused, but future versions might use this -*/ - -PUBLIC void colamd_set_defaults -( - /* === Parameters ======================================================= */ - - double knobs [COLAMD_KNOBS] /* knob array */ -) -{ - /* === Local variables ================================================== */ - - int i ; - - if (!knobs) - { - return ; /* no knobs to initialize */ - } - for (i = 0 ; i < COLAMD_KNOBS ; i++) - { - knobs [i] = 0 ; - } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ -} - - -/* ========================================================================== */ -/* === colamd =============================================================== */ -/* ========================================================================== */ - -/* - The colamd routine computes a column ordering Q of a sparse matrix - A such that the LU factorization P(AQ) = LU remains sparse, where P is - selected via partial pivoting. The routine can also be viewed as - providing a permutation Q such that the Cholesky factorization - (AQ)'(AQ) = LL' remains sparse. - - On input, the nonzero patterns of the columns of A are stored in the - array A, in order 0 to n_col-1. A is held in 0-based form (rows in the - range 0 to n_row-1 and columns in the range 0 to n_col-1). Row indices - for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0, - and thus p [n_col] is the number of entries in A. The matrix is - destroyed on output. The row indices within each column do not have to - be sorted (from small to large row indices), and duplicate row indices - may be present. However, colamd will work a little faster if columns are - sorted and no duplicates are present. Matlab 5.2 always passes the matrix - with sorted columns, and no duplicates. - - The integer array A is of size Alen. Alen must be at least of size - (where nnz is the number of entries in A): - - nnz for the input column form of A - + nnz for a row form of A that colamd generates - + 6*(n_col+1) for a ColInfo Col [0..n_col] array - (this assumes sizeof (ColInfo) is 6 int's). - + 4*(n_row+1) for a RowInfo Row [0..n_row] array - (this assumes sizeof (RowInfo) is 4 int's). - + elbow_room must be at least n_col. We recommend at least - nnz/5 in addition to that. If sufficient, - changes in the elbow room affect the ordering - time only, not the ordering itself. - + COLAMD_STATS for the output statistics - - Colamd returns FALSE is memory is insufficient, or TRUE otherwise. - - On input, the caller must specify: - - n_row the number of rows of A - n_col the number of columns of A - Alen the size of the array A - A [0 ... nnz-1] the row indices, where nnz = p [n_col] - A [nnz ... Alen-1] (need not be initialized by the user) - p [0 ... n_col] the column pointers, p [0] = 0, and p [n_col] - is the number of entries in A. Column c of A - is stored in A [p [c] ... p [c+1]-1]. - knobs [0 ... 19] a set of parameters that control the behavior - of colamd. If knobs is a NULL pointer the - defaults are used. The user-callable - colamd_set_defaults routine sets the default - parameters. See that routine for a description - of the user-controllable parameters. - - If the return value of Colamd is TRUE, then on output: - - p [0 ... n_col-1] the column permutation. p [0] is the first - column index, and p [n_col-1] is the last. - That is, p [k] = j means that column j of A - is the kth column of AQ. - - A is undefined on output (the matrix pattern is - destroyed), except for the following statistics: - - A [0] the number of dense (or empty) rows ignored - A [1] the number of dense (or empty) columms. These - are ordered last, in their natural order. - A [2] the number of garbage collections performed. - If this is excessive, then you would have - gotten your results faster if Alen was larger. - A [3] 0, if all row indices in each column were in - sorted order and no duplicates were present. - 1, if there were unsorted or duplicate row - indices in the input. You would have gotten - your results faster if A [3] was returned as 0. - - If the return value of Colamd is FALSE, then A and p are undefined on - output. -*/ - -PUBLIC int colamd /* returns TRUE if successful */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* length of A */ - int A [], /* row indices of A */ - int p [], /* pointers to columns in A */ - double knobs [COLAMD_KNOBS] /* parameters (uses defaults if NULL) */ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop index */ - int nnz ; /* nonzeros in A */ - int Row_size ; /* size of Row [], in integers */ - int Col_size ; /* size of Col [], in integers */ - int elbow_room ; /* remaining free space */ - RowInfo *Row ; /* pointer into A of Row [0..n_row] array */ - ColInfo *Col ; /* pointer into A of Col [0..n_col] array */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int ngarbage ; /* number of garbage collections performed */ - int max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs knobs array */ - int init_result ; /* return code from initialization */ - -#ifndef NDEBUG - debug_colamd = 0 ; /* no debug printing */ - /* get "D" environment variable, which gives the debug printing level */ - if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ; - DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ; -#endif - - /* === Check the input arguments ======================================== */ - - if (n_row < 0 || n_col < 0 || !A || !p) - { - /* n_row and n_col must be non-negative, A and p must be present */ - DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ; - return (FALSE) ; - } - nnz = p [n_col] ; - if (nnz < 0 || p [0] != 0) - { - /* nnz must be non-negative, and p [0] must be zero */ - DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ; - return (FALSE) ; - } - - /* === If no knobs, set default parameters ============================== */ - - if (!knobs) - { - knobs = default_knobs ; - colamd_set_defaults (knobs) ; - } - - /* === Allocate the Row and Col arrays from array A ===================== */ - - Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ; - Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ; - elbow_room = Alen - (2*nnz + Col_size + Row_size) ; - if (elbow_room < n_col + COLAMD_STATS) - { - /* not enough space in array A to perform the ordering */ - DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ; - return (FALSE) ; - } - Alen = 2*nnz + elbow_room ; - Col = (ColInfo *) &A [Alen] ; - Row = (RowInfo *) &A [Alen + Col_size] ; - - /* === Construct the row and column data structures ===================== */ - - init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ; - if (init_result == -1) - { - /* input matrix is invalid */ - DEBUG0 (("colamd error! matrix invalid\n")) ; - return (FALSE) ; - } - - /* === Initialize scores, kill dense rows/columns ======================= */ - - init_scoring (n_row, n_col, Row, Col, A, p, knobs, - &n_row2, &n_col2, &max_deg) ; - - /* === Order the supercolumns =========================================== */ - - ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p, - n_col2, max_deg, 2*nnz) ; - - /* === Order the non-principal columns ================================== */ - - order_children (n_col, Col, p) ; - - /* === Return statistics in A =========================================== */ - - for (i = 0 ; i < COLAMD_STATS ; i++) - { - A [i] = 0 ; - } - A [COLAMD_DENSE_ROW] = n_row - n_row2 ; - A [COLAMD_DENSE_COL] = n_col - n_col2 ; - A [COLAMD_DEFRAG_COUNT] = ngarbage ; - A [COLAMD_JUMBLED_COLS] = init_result ; - - return (TRUE) ; -} - - -/* ========================================================================== */ -/* === NON-USER-CALLABLE ROUTINES: ========================================== */ -/* ========================================================================== */ - -/* There are no user-callable routines beyond this point in the file */ - - -/* ========================================================================== */ -/* === init_rows_cols ======================================================= */ -/* ========================================================================== */ - -/* - Takes the column form of the matrix in A and creates the row form of the - matrix. Also, row and column attributes are stored in the Col and Row - structs. If the columns are un-sorted or contain duplicate row indices, - this routine will also sort and remove duplicate row indices from the - column form of the matrix. Returns -1 on error, 1 if columns jumbled, - or 0 if columns not jumbled. Not user-callable. -*/ - -PRIVATE int init_rows_cols /* returns status code */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* row indices of A, of size Alen */ - int p [] /* pointers to columns in A, of size n_col+1 */ -) -{ - /* === Local variables ================================================== */ - - int col ; /* a column index */ - int row ; /* a row index */ - int *cp ; /* a column pointer */ - int *cp_end ; /* a pointer to the end of a column */ - int *rp ; /* a row pointer */ - int *rp_end ; /* a pointer to the end of a row */ - int last_start ; /* start index of previous column in A */ - int start ; /* start index of column in A */ - int last_row ; /* previous row */ - int jumbled_columns ; /* indicates if columns are jumbled */ - - /* === Initialize columns, and check column pointers ==================== */ - - last_start = 0 ; - for (col = 0 ; col < n_col ; col++) - { - start = p [col] ; - if (start < last_start) - { - /* column pointers must be non-decreasing */ - DEBUG0 (("colamd error! last p %d p [col] %d\n",last_start,start)); - return (-1) ; - } - Col [col].start = start ; - Col [col].length = p [col+1] - start ; - Col [col].shared1.thickness = 1 ; - Col [col].shared2.score = 0 ; - Col [col].shared3.prev = EMPTY ; - Col [col].shared4.degree_next = EMPTY ; - last_start = start ; - } - /* must check the end pointer for last column */ - if (p [n_col] < last_start) - { - /* column pointers must be non-decreasing */ - DEBUG0 (("colamd error! last p %d p [n_col] %d\n",p[col],last_start)) ; - return (-1) ; - } - - /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ - - /* === Scan columns, compute row degrees, and check row indices ========= */ - - jumbled_columns = FALSE ; - - for (row = 0 ; row < n_row ; row++) - { - Row [row].length = 0 ; - Row [row].shared2.mark = -1 ; - } - - for (col = 0 ; col < n_col ; col++) - { - last_row = -1 ; - - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - - while (cp < cp_end) - { - row = *cp++ ; - - /* make sure row indices within range */ - if (row < 0 || row >= n_row) - { - DEBUG0 (("colamd error! col %d row %d last_row %d\n", - col, row, last_row)) ; - return (-1) ; - } - else if (row <= last_row) - { - /* row indices are not sorted or repeated, thus cols */ - /* are jumbled */ - jumbled_columns = TRUE ; - } - /* prevent repeated row from being counted */ - if (Row [row].shared2.mark != col) - { - Row [row].length++ ; - Row [row].shared2.mark = col ; - last_row = row ; - } - else - { - /* this is a repeated entry in the column, */ - /* it will be removed */ - Col [col].length-- ; - } - } - } - - /* === Compute row pointers ============================================= */ - - /* row form of the matrix starts directly after the column */ - /* form of matrix in A */ - Row [0].start = p [n_col] ; - Row [0].shared1.p = Row [0].start ; - Row [0].shared2.mark = -1 ; - for (row = 1 ; row < n_row ; row++) - { - Row [row].start = Row [row-1].start + Row [row-1].length ; - Row [row].shared1.p = Row [row].start ; - Row [row].shared2.mark = -1 ; - } - - /* === Create row form ================================================== */ - - if (jumbled_columns) - { - /* if cols jumbled, watch for repeated row indices */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - row = *cp++ ; - if (Row [row].shared2.mark != col) - { - A [(Row [row].shared1.p)++] = col ; - Row [row].shared2.mark = col ; - } - } - } - } - else - { - /* if cols not jumbled, we don't need the mark (this is faster) */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - A [(Row [*cp++].shared1.p)++] = col ; - } - } - } - - /* === Clear the row marks and set row degrees ========================== */ - - for (row = 0 ; row < n_row ; row++) - { - Row [row].shared2.mark = 0 ; - Row [row].shared1.degree = Row [row].length ; - } - - /* === See if we need to re-create columns ============================== */ - - if (jumbled_columns) - { - -#ifndef NDEBUG - /* make sure column lengths are correct */ - for (col = 0 ; col < n_col ; col++) - { - p [col] = Col [col].length ; - } - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - p [*rp++]-- ; - } - } - for (col = 0 ; col < n_col ; col++) - { - assert (p [col] == 0) ; - } - /* now p is all zero (different than when debugging is turned off) */ -#endif - - /* === Compute col pointers ========================================= */ - - /* col form of the matrix starts at A [0]. */ - /* Note, we may have a gap between the col form and the row */ - /* form if there were duplicate entries, if so, it will be */ - /* removed upon the first garbage collection */ - Col [0].start = 0 ; - p [0] = Col [0].start ; - for (col = 1 ; col < n_col ; col++) - { - /* note that the lengths here are for pruned columns, i.e. */ - /* no duplicate row indices will exist for these columns */ - Col [col].start = Col [col-1].start + Col [col-1].length ; - p [col] = Col [col].start ; - } - - /* === Re-create col form =========================================== */ - - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - A [(p [*rp++])++] = row ; - } - } - return (1) ; - } - else - { - /* no columns jumbled (this is faster) */ - return (0) ; - } -} - - -/* ========================================================================== */ -/* === init_scoring ========================================================= */ -/* ========================================================================== */ - -/* - Kills dense or empty columns and rows, calculates an initial score for - each column, and places all columns in the degree lists. Not user-callable. -*/ - -PRIVATE void init_scoring -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ - int *p_n_row2, /* number of non-dense, non-empty rows */ - int *p_n_col2, /* number of non-dense, non-empty columns */ - int *p_max_deg /* maximum row degree */ -) -{ - /* === Local variables ================================================== */ - - int c ; /* a column index */ - int r, row ; /* a row index */ - int *cp ; /* a column pointer */ - int deg ; /* degree (# entries) of a row or column */ - int *cp_end ; /* a pointer to the end of a column */ - int *new_cp ; /* new column pointer */ - int col_length ; /* length of pruned column */ - int score ; /* current column score */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int dense_row_count ; /* remove rows with more entries than this */ - int dense_col_count ; /* remove cols with more entries than this */ - int min_score ; /* smallest column score */ - int max_deg ; /* maximum row degree */ - int next_col ; /* Used to add to degree list.*/ -#ifndef NDEBUG - int debug_count ; /* debug only. */ -#endif - - /* === Extract knobs ==================================================== */ - - dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; - dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; - DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ; - max_deg = 0 ; - n_col2 = n_col ; - n_row2 = n_row ; - - /* === Kill empty columns =============================================== */ - - /* Put the empty columns at the end in their natural, so that LU */ - /* factorization can proceed as far as possible. */ - for (c = n_col-1 ; c >= 0 ; c--) - { - deg = Col [c].length ; - if (deg == 0) - { - /* this is a empty column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense columns =============================================== */ - - /* Put the dense columns at the end, in their natural order */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip any dead columns */ - if (COL_IS_DEAD (c)) - { - continue ; - } - deg = Col [c].length ; - if (deg > dense_col_count) - { - /* this is a dense column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - /* decrement the row degrees */ - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - Row [*cp++].shared1.degree-- ; - } - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense and empty rows ======================================== */ - - for (r = 0 ; r < n_row ; r++) - { - deg = Row [r].shared1.degree ; - assert (deg >= 0 && deg <= n_col) ; - if (deg > dense_row_count || deg == 0) - { - /* kill a dense or empty row */ - KILL_ROW (r) ; - --n_row2 ; - } - else - { - /* keep track of max degree of remaining rows */ - max_deg = MAX (max_deg, deg) ; - } - } - DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ; - - /* === Compute initial column scores ==================================== */ - - /* At this point the row degrees are accurate. They reflect the number */ - /* of "live" (non-dense) columns in each row. No empty rows exist. */ - /* Some "live" columns may contain only dead rows, however. These are */ - /* pruned in the code below. */ - - /* now find the initial matlab score for each column */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip dead column */ - if (COL_IS_DEAD (c)) - { - continue ; - } - score = 0 ; - cp = &A [Col [c].start] ; - new_cp = cp ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - /* skip if dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - /* compact the column */ - *new_cp++ = row ; - /* add row's external degree */ - score += Row [row].shared1.degree - 1 ; - /* guard against integer overflow */ - score = MIN (score, n_col) ; - } - /* determine pruned column length */ - col_length = (int) (new_cp - &A [Col [c].start]) ; - if (col_length == 0) - { - /* a newly-made null column (all rows in this col are "dense" */ - /* and have already been killed) */ - DEBUG0 (("Newly null killed: %d\n", c)) ; - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - else - { - /* set column length and set score */ - assert (score >= 0) ; - assert (score <= n_col) ; - Col [c].length = col_length ; - Col [c].shared2.score = score ; - } - } - DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ; - - /* At this point, all empty rows and columns are dead. All live columns */ - /* are "clean" (containing no dead rows) and simplicial (no supercolumns */ - /* yet). Rows may contain dead columns, but all live rows contain at */ - /* least one live column. */ - -#ifndef NDEBUG - debug_structures (n_row, n_col, Row, Col, A, n_col2) ; -#endif - - /* === Initialize degree lists ========================================== */ - -#ifndef NDEBUG - debug_count = 0 ; -#endif - - /* clear the hash buckets */ - for (c = 0 ; c <= n_col ; c++) - { - head [c] = EMPTY ; - } - min_score = n_col ; - /* place in reverse order, so low column indices are at the front */ - /* of the lists. This is to encourage natural tie-breaking */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) - { - DEBUG4 (("place %d score %d minscore %d ncol %d\n", - c, Col [c].shared2.score, min_score, n_col)) ; - - /* === Add columns score to DList =============================== */ - - score = Col [c].shared2.score ; - - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (score >= 0) ; - assert (score <= n_col) ; - assert (head [score] >= EMPTY) ; - - /* now add this column to dList at proper score location */ - next_col = head [score] ; - Col [c].shared3.prev = EMPTY ; - Col [c].shared4.degree_next = next_col ; - - /* if there already was a column with the same score, set its */ - /* previous pointer to this new column */ - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = c ; - } - head [score] = c ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, score) ; - -#ifndef NDEBUG - debug_count++ ; -#endif - } - } - -#ifndef NDEBUG - DEBUG0 (("Live cols %d out of %d, non-princ: %d\n", - debug_count, n_col, n_col-debug_count)) ; - assert (debug_count == n_col2) ; - debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ; -#endif - - /* === Return number of remaining columns, and max row degree =========== */ - - *p_n_col2 = n_col2 ; - *p_n_row2 = n_row2 ; - *p_max_deg = max_deg ; -} - - -/* ========================================================================== */ -/* === find_ordering ======================================================== */ -/* ========================================================================== */ - -/* - Order the principal columns of the supercolumn form of the matrix - (no supercolumns on input). Uses a minimum approximate column minimum - degree ordering method. Not user-callable. -*/ - -PRIVATE int find_ordering /* return the number of garbage collections */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - int Alen, /* size of A, 2*nnz + elbow_room or larger */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - int n_col2, /* Remaining columns to order */ - int max_deg, /* Maximum row degree */ - int pfree /* index of first free slot (2*nnz on entry) */ -) -{ - /* === Local variables ================================================== */ - - int k ; /* current pivot ordering step */ - int pivot_col ; /* current pivot column */ - int *cp ; /* a column pointer */ - int *rp ; /* a row pointer */ - int pivot_row ; /* current pivot row */ - int *new_cp ; /* modified column pointer */ - int *new_rp ; /* modified row pointer */ - int pivot_row_start ; /* pointer to start of pivot row */ - int pivot_row_degree ; /* # of columns in pivot row */ - int pivot_row_length ; /* # of supercolumns in pivot row */ - int pivot_col_score ; /* score of pivot column */ - int needed_memory ; /* free space needed for pivot row */ - int *cp_end ; /* pointer to the end of a column */ - int *rp_end ; /* pointer to the end of a row */ - int row ; /* a row index */ - int col ; /* a column index */ - int max_score ; /* maximum possible score */ - int cur_score ; /* score of current column */ - unsigned int hash ; /* hash value for supernode detection */ - int head_column ; /* head of hash bucket */ - int first_col ; /* first column in hash bucket */ - int tag_mark ; /* marker value for mark array */ - int row_mark ; /* Row [row].shared2.mark */ - int set_difference ; /* set difference size of row with pivot row */ - int min_score ; /* smallest column score */ - int col_thickness ; /* "thickness" (# of columns in a supercol) */ - int max_mark ; /* maximum value of tag_mark */ - int pivot_col_thickness ; /* number of columns represented by pivot col */ - int prev_col ; /* Used by Dlist operations. */ - int next_col ; /* Used by Dlist operations. */ - int ngarbage ; /* number of garbage collections performed */ -#ifndef NDEBUG - int debug_d ; /* debug loop counter */ - int debug_step = 0 ; /* debug loop counter */ -#endif - - /* === Initialization and clear mark ==================================== */ - - max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = clear_mark (n_row, Row) ; - min_score = 0 ; - ngarbage = 0 ; - DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ; - - /* === Order the columns ================================================ */ - - for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */) - { - -#ifndef NDEBUG - if (debug_step % 100 == 0) - { - DEBUG0 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - else - { - DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - debug_step++ ; - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif - - /* === Select pivot column, and order it ============================ */ - - /* make sure degree list isn't empty */ - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (head [min_score] >= EMPTY) ; - -#ifndef NDEBUG - for (debug_d = 0 ; debug_d < min_score ; debug_d++) - { - assert (head [debug_d] == EMPTY) ; - } -#endif - - /* get pivot column from head of minimum degree list */ - while (head [min_score] == EMPTY && min_score < n_col) - { - min_score++ ; - } - pivot_col = head [min_score] ; - assert (pivot_col >= 0 && pivot_col <= n_col) ; - next_col = Col [pivot_col].shared4.degree_next ; - head [min_score] = next_col ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = EMPTY ; - } - - assert (COL_IS_ALIVE (pivot_col)) ; - DEBUG3 (("Pivot col: %d\n", pivot_col)) ; - - /* remember score for defrag check */ - pivot_col_score = Col [pivot_col].shared2.score ; - - /* the pivot column is the kth column in the pivot order */ - Col [pivot_col].shared2.order = k ; - - /* increment order count by column thickness */ - pivot_col_thickness = Col [pivot_col].shared1.thickness ; - k += pivot_col_thickness ; - assert (pivot_col_thickness > 0) ; - - /* === Garbage_collection, if necessary ============================= */ - - needed_memory = MIN (pivot_col_score, n_col - k) ; - if (pfree + needed_memory >= Alen) - { - pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; - ngarbage++ ; - /* after garbage collection we will have enough */ - assert (pfree + needed_memory < Alen) ; - /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = clear_mark (n_row, Row) ; -#ifndef NDEBUG - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif - } - - /* === Compute pivot row pattern ==================================== */ - - /* get starting location for this new merged row */ - pivot_row_start = pfree ; - - /* initialize new row counts to zero */ - pivot_row_degree = 0 ; - - /* tag pivot column as having been visited so it isn't included */ - /* in merged pivot row */ - Col [pivot_col].shared1.thickness = -pivot_col_thickness ; - - /* pivot row is the union of all rows in the pivot column pattern */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; - /* skip if row is dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - /* add the column, if alive and untagged */ - col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) - { - /* tag column in pivot row */ - Col [col].shared1.thickness = -col_thickness ; - assert (pfree < Alen) ; - /* place column in pivot row */ - A [pfree++] = col ; - pivot_row_degree += col_thickness ; - } - } - } - - /* clear tag on pivot column */ - Col [pivot_col].shared1.thickness = pivot_col_thickness ; - max_deg = MAX (max_deg, pivot_row_degree) ; - -#ifndef NDEBUG - DEBUG3 (("check2\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif - - /* === Kill all rows used to construct pivot row ==================== */ - - /* also kill pivot row, temporarily */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* may be killing an already dead row */ - row = *cp++ ; - DEBUG2 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; - } - - /* === Select a row index to use as the new pivot row =============== */ - - pivot_row_length = pfree - pivot_row_start ; - if (pivot_row_length > 0) - { - /* pick the "pivot" row arbitrarily (first row in col) */ - pivot_row = A [Col [pivot_col].start] ; - DEBUG2 (("Pivotal row is %d\n", pivot_row)) ; - } - else - { - /* there is no pivot row, since it is of zero length */ - pivot_row = EMPTY ; - assert (pivot_row_length == 0) ; - } - assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ; - - /* === Approximate degree computation =============================== */ - - /* Here begins the computation of the approximate degree. The column */ - /* score is the sum of the pivot row "length", plus the size of the */ - /* set differences of each row in the column minus the pattern of the */ - /* pivot row itself. The column ("thickness") itself is also */ - /* excluded from the column score (we thus use an approximate */ - /* external degree). */ - - /* The time taken by the following code (compute set differences, and */ - /* add them up) is proportional to the size of the data structure */ - /* being scanned - that is, the sum of the sizes of each column in */ - /* the pivot row. Thus, the amortized time to compute a column score */ - /* is proportional to the size of that column (where size, in this */ - /* context, is the column "length", or the number of row indices */ - /* in that column). The number of row indices in a column is */ - /* monotonically non-decreasing, from the length of the original */ - /* column on input to colamd. */ - - /* === Compute set differences ====================================== */ - - DEBUG1 (("** Computing set differences phase. **\n")) ; - - /* pivot row is currently dead - it will be revived later. */ - - DEBUG2 (("Pivot row: ")) ; - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - assert (COL_IS_ALIVE (col) && col != pivot_col) ; - DEBUG2 (("Col: %d\n", col)) ; - - /* clear tags used to construct pivot row pattern */ - col_thickness = -Col [col].shared1.thickness ; - assert (col_thickness > 0) ; - Col [col].shared1.thickness = col_thickness ; - - /* === Remove column from degree list =========================== */ - - cur_score = Col [col].shared2.score ; - prev_col = Col [col].shared3.prev ; - next_col = Col [col].shared4.degree_next ; - assert (cur_score >= 0) ; - assert (cur_score <= n_col) ; - assert (cur_score >= EMPTY) ; - if (prev_col == EMPTY) - { - head [cur_score] = next_col ; - } - else - { - Col [prev_col].shared4.degree_next = next_col ; - } - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = prev_col ; - } - - /* === Scan the column ========================================== */ - - cp = &A [Col [col].start] ; - cp_end = cp + Col [col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - assert (row != pivot_row) ; - set_difference = row_mark - tag_mark ; - /* check if the row has been seen yet */ - if (set_difference < 0) - { - assert (Row [row].shared1.degree <= max_deg) ; - set_difference = Row [row].shared1.degree ; - } - /* subtract column thickness from this row's set difference */ - set_difference -= col_thickness ; - assert (set_difference >= 0) ; - /* absorb this row if the set difference becomes zero */ - if (set_difference == 0) - { - DEBUG1 (("aggressive absorption. Row: %d\n", row)) ; - KILL_ROW (row) ; - } - else - { - /* save the new mark */ - Row [row].shared2.mark = set_difference + tag_mark ; - } - } - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k-pivot_row_degree, max_deg) ; -#endif - - /* === Add up set differences for each column ======================= */ - - DEBUG1 (("** Adding set differences phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - assert (COL_IS_ALIVE (col) && col != pivot_col) ; - hash = 0 ; - cur_score = 0 ; - cp = &A [Col [col].start] ; - /* compact the column */ - new_cp = cp ; - cp_end = cp + Col [col].length ; - - DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ; - - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - assert(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - assert (row_mark > tag_mark) ; - /* compact the column */ - *new_cp++ = row ; - /* compute hash function */ - hash += row ; - /* add set difference */ - cur_score += row_mark - tag_mark ; - /* integer overflow... */ - cur_score = MIN (cur_score, n_col) ; - } - - /* recompute the column's length */ - Col [col].length = (int) (new_cp - &A [Col [col].start]) ; - - /* === Further mass elimination ================================= */ - - if (Col [col].length == 0) - { - DEBUG1 (("further mass elimination. Col: %d\n", col)) ; - /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; - pivot_row_degree -= Col [col].shared1.thickness ; - assert (pivot_row_degree >= 0) ; - /* order it */ - Col [col].shared2.order = k ; - /* increment order count by column thickness */ - k += Col [col].shared1.thickness ; - } - else - { - /* === Prepare for supercolumn detection ==================== */ - - DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ; - - /* save score so far */ - Col [col].shared2.score = cur_score ; - - /* add column to hash table, for supercolumn detection */ - hash %= n_col + 1 ; - - DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ; - assert (hash <= n_col) ; - - head_column = head [hash] ; - if (head_column > EMPTY) - { - /* degree list "hash" is non-empty, use prev (shared3) of */ - /* first column in degree list as head of hash bucket */ - first_col = Col [head_column].shared3.headhash ; - Col [head_column].shared3.headhash = col ; - } - else - { - /* degree list "hash" is empty, use head as hash bucket */ - first_col = - (head_column + 2) ; - head [hash] = - (col + 2) ; - } - Col [col].shared4.hash_next = first_col ; - - /* save hash function in Col [col].shared3.hash */ - Col [col].shared3.hash = (int) hash ; - assert (COL_IS_ALIVE (col)) ; - } - } - - /* The approximate external column degree is now computed. */ - - /* === Supercolumn detection ======================================== */ - - DEBUG1 (("** Supercolumn detection phase. **\n")) ; - - detect_super_cols ( -#ifndef NDEBUG - n_col, Row, -#endif - Col, A, head, pivot_row_start, pivot_row_length) ; - - /* === Kill the pivotal column ====================================== */ - - KILL_PRINCIPAL_COL (pivot_col) ; - - /* === Clear mark =================================================== */ - - tag_mark += (max_deg + 1) ; - if (tag_mark >= max_mark) - { - DEBUG1 (("clearing tag_mark\n")) ; - tag_mark = clear_mark (n_row, Row) ; - } -#ifndef NDEBUG - DEBUG3 (("check3\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif - - /* === Finalize the new pivot row, and column scores ================ */ - - DEBUG1 (("** Finalize scores phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - /* compact the pivot row */ - new_rp = rp ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - /* skip dead columns */ - if (COL_IS_DEAD (col)) - { - continue ; - } - *new_rp++ = col ; - /* add new pivot row to column */ - A [Col [col].start + (Col [col].length++)] = pivot_row ; - - /* retrieve score so far and add on pivot row's degree. */ - /* (we wait until here for this in case the pivot */ - /* row's degree was reduced due to mass elimination). */ - cur_score = Col [col].shared2.score + pivot_row_degree ; - - /* calculate the max possible score as the number of */ - /* external columns minus the 'k' value minus the */ - /* columns thickness */ - max_score = n_col - k - Col [col].shared1.thickness ; - - /* make the score the external degree of the union-of-rows */ - cur_score -= Col [col].shared1.thickness ; - - /* make sure score is less or equal than the max score */ - cur_score = MIN (cur_score, max_score) ; - assert (cur_score >= 0) ; - - /* store updated score */ - Col [col].shared2.score = cur_score ; - - /* === Place column back in degree list ========================= */ - - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (cur_score >= 0) ; - assert (cur_score <= n_col) ; - assert (head [cur_score] >= EMPTY) ; - next_col = head [cur_score] ; - Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = EMPTY ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = col ; - } - head [cur_score] = col ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, cur_score) ; - - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; -#endif - - /* === Resurrect the new pivot row ================================== */ - - if (pivot_row_degree > 0) - { - /* update pivot row length to reflect any cols that were killed */ - /* during super-col detection and mass elimination */ - Row [pivot_row].start = pivot_row_start ; - Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ; - Row [pivot_row].shared1.degree = pivot_row_degree ; - Row [pivot_row].shared2.mark = 0 ; - /* pivot row is no longer dead */ - } - } - - /* === All principal columns have now been ordered ====================== */ - - return (ngarbage) ; -} - - -/* ========================================================================== */ -/* === order_children ======================================================= */ -/* ========================================================================== */ - -/* - The find_ordering routine has ordered all of the principal columns (the - representatives of the supercolumns). The non-principal columns have not - yet been ordered. This routine orders those columns by walking up the - parent tree (a column is a child of the column which absorbed it). The - final permutation vector is then placed in p [0 ... n_col-1], with p [0] - being the first column, and p [n_col-1] being the last. It doesn't look - like it at first glance, but be assured that this routine takes time linear - in the number of columns. Although not immediately obvious, the time - taken by this routine is O (n_col), that is, linear in the number of - columns. Not user-callable. -*/ - -PRIVATE void order_children -( - /* === Parameters ======================================================= */ - - int n_col, /* number of columns of A */ - ColInfo Col [], /* of size n_col+1 */ - int p [] /* p [0 ... n_col-1] is the column permutation*/ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop counter for all columns */ - int c ; /* column index */ - int parent ; /* index of column's parent */ - int order ; /* column's order */ - - /* === Order each non-principal column ================================== */ - - for (i = 0 ; i < n_col ; i++) - { - /* find an un-ordered non-principal column */ - assert (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY) - { - parent = i ; - /* once found, find its principal parent */ - do - { - parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; - - /* now, order all un-ordered non-principal columns along path */ - /* to this parent. collapse tree at the same time */ - c = i ; - /* get order of parent */ - order = Col [parent].shared2.order ; - - do - { - assert (Col [c].shared2.order == EMPTY) ; - - /* order this column */ - Col [c].shared2.order = order++ ; - /* collaps tree */ - Col [c].shared1.parent = parent ; - - /* get immediate parent of this column */ - c = Col [c].shared1.parent ; - - /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ - /* above an ordered column */ - } while (Col [c].shared2.order == EMPTY) ; - - /* re-order the super_col parent to largest order for this group */ - Col [parent].shared2.order = order ; - } - } - - /* === Generate the permutation ========================================= */ - - for (c = 0 ; c < n_col ; c++) - { - p [Col [c].shared2.order] = c ; - } -} - - -/* ========================================================================== */ -/* === detect_super_cols ==================================================== */ -/* ========================================================================== */ - -/* - Detects supercolumns by finding matches between columns in the hash buckets. - Check amongst columns in the set A [row_start ... row_start + row_length-1]. - The columns under consideration are currently *not* in the degree lists, - and have already been placed in the hash buckets. - - The hash bucket for columns whose hash function is equal to h is stored - as follows: - - if head [h] is >= 0, then head [h] contains a degree list, so: - - head [h] is the first column in degree bucket h. - Col [head [h]].headhash gives the first column in hash bucket h. - - otherwise, the degree list is empty, and: - - -(head [h] + 2) is the first column in hash bucket h. - - For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous - column" pointer. Col [c].shared3.hash is used instead as the hash number - for that column. The value of Col [c].shared4.hash_next is the next column - in the same hash bucket. - - Assuming no, or "few" hash collisions, the time taken by this routine is - linear in the sum of the sizes (lengths) of each column whose score has - just been computed in the approximate degree computation. - Not user-callable. -*/ - -PRIVATE void detect_super_cols -( - /* === Parameters ======================================================= */ - -#ifndef NDEBUG - /* these two parameters are only needed when debugging is enabled: */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ -#endif - ColInfo Col [], /* of size n_col+1 */ - int A [], /* row indices of A */ - int head [], /* head of degree lists and hash buckets */ - int row_start, /* pointer to set of columns to check */ - int row_length /* number of columns to check */ -) -{ - /* === Local variables ================================================== */ - - int hash ; /* hash # for a column */ - int *rp ; /* pointer to a row */ - int c ; /* a column index */ - int super_c ; /* column index of the column to absorb into */ - int *cp1 ; /* column pointer for column super_c */ - int *cp2 ; /* column pointer for column c */ - int length ; /* length of column super_c */ - int prev_c ; /* column preceding c in hash bucket */ - int i ; /* loop counter */ - int *rp_end ; /* pointer to the end of the row */ - int col ; /* a column index in the row to check */ - int head_column ; /* first column in hash bucket or degree list */ - int first_col ; /* first column in hash bucket */ - - /* === Consider each column in the row ================================== */ - - rp = &A [row_start] ; - rp_end = rp + row_length ; - while (rp < rp_end) - { - col = *rp++ ; - if (COL_IS_DEAD (col)) - { - continue ; - } - - /* get hash number for this column */ - hash = Col [col].shared3.hash ; - assert (hash <= n_col) ; - - /* === Get the first column in this hash bucket ===================== */ - - head_column = head [hash] ; - if (head_column > EMPTY) - { - first_col = Col [head_column].shared3.headhash ; - } - else - { - first_col = - (head_column + 2) ; - } - - /* === Consider each column in the hash bucket ====================== */ - - for (super_c = first_col ; super_c != EMPTY ; - super_c = Col [super_c].shared4.hash_next) - { - assert (COL_IS_ALIVE (super_c)) ; - assert (Col [super_c].shared3.hash == hash) ; - length = Col [super_c].length ; - - /* prev_c is the column preceding column c in the hash bucket */ - prev_c = super_c ; - - /* === Compare super_c with all columns after it ================ */ - - for (c = Col [super_c].shared4.hash_next ; - c != EMPTY ; c = Col [c].shared4.hash_next) - { - assert (c != super_c) ; - assert (COL_IS_ALIVE (c)) ; - assert (Col [c].shared3.hash == hash) ; - - /* not identical if lengths or scores are different */ - if (Col [c].length != length || - Col [c].shared2.score != Col [super_c].shared2.score) - { - prev_c = c ; - continue ; - } - - /* compare the two columns */ - cp1 = &A [Col [super_c].start] ; - cp2 = &A [Col [c].start] ; - - for (i = 0 ; i < length ; i++) - { - /* the columns are "clean" (no dead rows) */ - assert (ROW_IS_ALIVE (*cp1)) ; - assert (ROW_IS_ALIVE (*cp2)) ; - /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ - if (*cp1++ != *cp2++) - { - break ; - } - } - - /* the two columns are different if the for-loop "broke" */ - if (i != length) - { - prev_c = c ; - continue ; - } - - /* === Got it! two columns are identical =================== */ - - assert (Col [c].shared2.score == Col [super_c].shared2.score) ; - - Col [super_c].shared1.thickness += Col [c].shared1.thickness ; - Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; - /* order c later, in order_children() */ - Col [c].shared2.order = EMPTY ; - /* remove c from hash bucket */ - Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; - } - } - - /* === Empty this hash bucket ======================================= */ - - if (head_column > EMPTY) - { - /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = EMPTY ; - } - else - { - /* corresponding degree list "hash" is empty */ - head [hash] = EMPTY ; - } - } -} - - -/* ========================================================================== */ -/* === garbage_collection =================================================== */ -/* ========================================================================== */ - -/* - Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns - the index of the first free position in A, after garbage collection. The - time taken by this routine is linear is the size of the array A, which is - itself linear in the number of nonzeros in the input matrix. - Not user-callable. -*/ - -PRIVATE int garbage_collection /* returns the new value of pfree */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows */ - int n_col, /* number of columns */ - RowInfo Row [], /* row info */ - ColInfo Col [], /* column info */ - int A [], /* A [0 ... Alen-1] holds the matrix */ - int *pfree /* &A [0] ... pfree is in use */ -) -{ - /* === Local variables ================================================== */ - - int *psrc ; /* source pointer */ - int *pdest ; /* destination pointer */ - int j ; /* counter */ - int r ; /* a row index */ - int c ; /* a column index */ - int length ; /* length of a row or column */ - -#ifndef NDEBUG - int debug_rows ; - DEBUG0 (("Defrag..\n")) ; - for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ; - debug_rows = 0 ; -#endif - - /* === Defragment the columns =========================================== */ - - pdest = &A[0] ; - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - psrc = &A [Col [c].start] ; - - /* move and compact the column */ - assert (pdest <= psrc) ; - Col [c].start = (int) (pdest - &A [0]) ; - length = Col [c].length ; - for (j = 0 ; j < length ; j++) - { - r = *psrc++ ; - if (ROW_IS_ALIVE (r)) - { - *pdest++ = r ; - } - } - Col [c].length = (int) (pdest - &A [Col [c].start]) ; - } - } - - /* === Prepare to defragment the rows =================================== */ - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - if (Row [r].length == 0) - { - /* this row is of zero length. cannot compact it, so kill it */ - DEBUG0 (("Defrag row kill\n")) ; - KILL_ROW (r) ; - } - else - { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - assert (ROW_IS_ALIVE (r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; -#ifndef NDEBUG - debug_rows++ ; -#endif - } - } - } - - /* === Defragment the rows ============================================== */ - - psrc = pdest ; - while (psrc < pfree) - { - /* find a negative number ... the start of a row */ - if (*psrc++ < 0) - { - psrc-- ; - /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; - assert (r >= 0 && r < n_row) ; - /* restore first column index */ - *psrc = Row [r].shared2.first_column ; - assert (ROW_IS_ALIVE (r)) ; - - /* move and compact the row */ - assert (pdest <= psrc) ; - Row [r].start = (int) (pdest - &A [0]) ; - length = Row [r].length ; - for (j = 0 ; j < length ; j++) - { - c = *psrc++ ; - if (COL_IS_ALIVE (c)) - { - *pdest++ = c ; - } - } - Row [r].length = (int) (pdest - &A [Row [r].start]) ; -#ifndef NDEBUG - debug_rows-- ; -#endif - } - } - /* ensure we found all the rows */ - assert (debug_rows == 0) ; - - /* === Return the new value of pfree ==================================== */ - - return ((int) (pdest - &A [0])) ; -} - - -/* ========================================================================== */ -/* === clear_mark =========================================================== */ -/* ========================================================================== */ - -/* - Clears the Row [].shared2.mark array, and returns the new tag_mark. - Return value is the new tag_mark. Not user-callable. -*/ - -PRIVATE int clear_mark /* return the new value for tag_mark */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - RowInfo Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ -) -{ - /* === Local variables ================================================== */ - - int r ; - - DEBUG0 (("Clear mark\n")) ; - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - Row [r].shared2.mark = 0 ; - } - } - return (1) ; -} - - -/* ========================================================================== */ -/* === debugging routines =================================================== */ -/* ========================================================================== */ - -/* When debugging is disabled, the remainder of this file is ignored. */ - -#ifndef NDEBUG - - -/* ========================================================================== */ -/* === debug_structures ===================================================== */ -/* ========================================================================== */ - -/* - At this point, all empty rows and columns are dead. All live columns - are "clean" (containing no dead rows) and simplicial (no supercolumns - yet). Rows may contain dead columns, but all live rows contain at - least one live column. -*/ - -PRIVATE void debug_structures -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int n_col2 -) -{ - /* === Local variables ================================================== */ - - int i ; - int c ; - int *cp ; - int *cp_end ; - int len ; - int score ; - int r ; - int *rp ; - int *rp_end ; - int deg ; - - /* === Check A, Row, and Col ============================================ */ - - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - len = Col [c].length ; - score = Col [c].shared2.score ; - DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ; - assert (len > 0) ; - assert (score >= 0) ; - assert (Col [c].shared1.thickness == 1) ; - cp = &A [Col [c].start] ; - cp_end = cp + len ; - while (cp < cp_end) - { - r = *cp++ ; - assert (ROW_IS_ALIVE (r)) ; - } - } - else - { - i = Col [c].shared2.order ; - assert (i >= n_col2 && i < n_col) ; - } - } - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - i = 0 ; - len = Row [r].length ; - deg = Row [r].shared1.degree ; - assert (len > 0) ; - assert (deg > 0) ; - rp = &A [Row [r].start] ; - rp_end = rp + len ; - while (rp < rp_end) - { - c = *rp++ ; - if (COL_IS_ALIVE (c)) - { - i++ ; - } - } - assert (i > 0) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_deg_lists ====================================================== */ -/* ========================================================================== */ - -/* - Prints the contents of the degree lists. Counts the number of columns - in the degree list and compares it to the total it should have. Also - checks the row degrees. -*/ - -PRIVATE void debug_deg_lists -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int head [], - int min_score, - int should, - int max_deg -) -{ - /* === Local variables ================================================== */ - - int deg ; - int col ; - int have ; - int row ; - - /* === Check the degree lists =========================================== */ - - if (n_col > 10000 && debug_colamd <= 0) - { - return ; - } - have = 0 ; - DEBUG4 (("Degree lists: %d\n", min_score)) ; - for (deg = 0 ; deg <= n_col ; deg++) - { - col = head [deg] ; - if (col == EMPTY) - { - continue ; - } - DEBUG4 (("%d:", deg)) ; - while (col != EMPTY) - { - DEBUG4 ((" %d", col)) ; - have += Col [col].shared1.thickness ; - assert (COL_IS_ALIVE (col)) ; - col = Col [col].shared4.degree_next ; - } - DEBUG4 (("\n")) ; - } - DEBUG4 (("should %d have %d\n", should, have)) ; - assert (should == have) ; - - /* === Check the row degrees ============================================ */ - - if (n_row > 10000 && debug_colamd <= 0) - { - return ; - } - for (row = 0 ; row < n_row ; row++) - { - if (ROW_IS_ALIVE (row)) - { - assert (Row [row].shared1.degree <= max_deg) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_mark =========================================================== */ -/* ========================================================================== */ - -/* - Ensures that the tag_mark is less that the maximum and also ensures that - each entry in the mark array is less than the tag mark. -*/ - -PRIVATE void debug_mark -( - /* === Parameters ======================================================= */ - - int n_row, - RowInfo Row [], - int tag_mark, - int max_mark -) -{ - /* === Local variables ================================================== */ - - int r ; - - /* === Check the Row marks ============================================== */ - - assert (tag_mark > 0 && tag_mark <= max_mark) ; - if (n_row > 10000 && debug_colamd <= 0) - { - return ; - } - for (r = 0 ; r < n_row ; r++) - { - assert (Row [r].shared2.mark < tag_mark) ; - } -} - - -/* ========================================================================== */ -/* === debug_matrix ========================================================= */ -/* ========================================================================== */ - -/* - Prints out the contents of the columns and the rows. -*/ - -PRIVATE void debug_matrix -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [] -) -{ - /* === Local variables ================================================== */ - - int r ; - int c ; - int *rp ; - int *rp_end ; - int *cp ; - int *cp_end ; - - /* === Dump the rows and columns of the matrix ========================== */ - - if (debug_colamd < 3) - { - return ; - } - DEBUG3 (("DUMP MATRIX:\n")) ; - for (r = 0 ; r < n_row ; r++) - { - DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ; - if (ROW_IS_DEAD (r)) - { - continue ; - } - DEBUG3 (("start %d length %d degree %d\n", - Row [r].start, Row [r].length, Row [r].shared1.degree)) ; - rp = &A [Row [r].start] ; - rp_end = rp + Row [r].length ; - while (rp < rp_end) - { - c = *rp++ ; - DEBUG3 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ; - } - } - - for (c = 0 ; c < n_col ; c++) - { - DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ; - if (COL_IS_DEAD (c)) - { - continue ; - } - DEBUG3 (("start %d length %d shared1 %d shared2 %d\n", - Col [c].start, Col [c].length, - Col [c].shared1.thickness, Col [c].shared2.score)) ; - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - r = *cp++ ; - DEBUG3 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ; - } - } -} - -#endif - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,73 +0,0 @@ -/* ========================================================================== */ -/* === colamd prototypes and definitions ==================================== */ -/* ========================================================================== */ - -/* - This is the colamd include file, - - http://www.cise.ufl.edu/~davis/colamd/colamd.h - - for use in the colamd.c, colamdmex.c, and symamdmex.c files located at - - http://www.cise.ufl.edu/~davis/colamd/ - - See those files for a description of colamd and symamd, and for the - copyright notice, which also applies to this file. - - August 3, 1998. Version 1.0. -*/ - -/* ========================================================================== */ -/* === Definitions ========================================================== */ -/* ========================================================================== */ - -/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 - -/* number of output statistics. Only A [0..2] are currently used. */ -#define COLAMD_STATS 20 - -/* knobs [0] and A [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 - -/* knobs [1] and A [1]: dense column knob and output statistic. */ -#define COLAMD_DENSE_COL 1 - -/* A [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 - -/* A [3]: whether or not the input columns were jumbled or had duplicates */ -#define COLAMD_JUMBLED_COLS 3 - -/* ========================================================================== */ -/* === Prototypes of user-callable routines ================================= */ -/* ========================================================================== */ - -#ifdef _CRAY -#define int short -#elif defined (_LONGINT) -#define int long -#endif - -int colamd_recommended /* returns recommended value of Alen */ -( - int nnz, /* nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) ; - -void colamd_set_defaults /* sets default parameters */ -( /* knobs argument is modified on output */ - double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ -) ; - -int colamd /* returns TRUE if successful, FALSE otherwise*/ -( /* A and p arguments are modified on output */ - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* size of the array A */ - int A [], /* row indices of A, of size Alen */ - int p [], /* column pointers of A, of size n_col+1 */ - double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ -) ; - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pddistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pddistribute.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pddistribute.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pddistribute.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1044 +0,0 @@ - - -#include "superlu_ddefs.h" - -int_t -dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, - gridinfo_t *grid, int_t *colptr[], int_t *rowind[], - double *a[]) -{ -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. - * - * Arguments - * ========= - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * colptr (output) int* - * - * rowind (output) int* - * - * a (output) double* - * - * Return value - * ============ - * - */ - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; - int_t nnz_loc; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; - double *nzval_a; - int iam, it, p, procs; - MPI_Request *send_req; - MPI_Status status; - - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dReDistribute_A()"); -#endif - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - nnzToRecv = intCalloc_dist(2*procs); - nnzToSend = nnzToRecv + procs; - - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate space for storing the triplets after redistribution. */ - if ( k ) { /* count can be zero. */ - if ( !(ia = intMalloc_dist(2*k)) ) - ABORT("Malloc fails for ia[]."); - if ( !(aij = doubleMalloc_dist(k)) ) - ABORT("Malloc fails for aij[]."); - } - ja = ia + k; - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) - ABORT("Malloc fails for ia_send[]."); - if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) - ABORT("Malloc fails for aij_send[]."); - if ( SendCnt ) { /* count can be zero */ - if ( !(index = intMalloc_dist(2*SendCnt)) ) - ABORT("Malloc fails for index[]."); - if ( !(nzval = doubleMalloc_dist(SendCnt)) ) - ABORT("Malloc fails for nzval[]."); - } - if ( !(ptr_to_send = intCalloc_dist(procs)) ) - ABORT("Malloc fails for ptr_to_send[]."); - if ( maxnnzToRecv ) { /* count can be zero */ - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) - ABORT("Malloc fails for itemp[]."); - if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) - ABORT("Malloc fails for dtemp[]."); - } - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - if ( !(*colptr = intCalloc_dist(n+1)) ) - ABORT("Malloc fails for *colptr[]."); - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - ++(*colptr)[jcol]; /* Count nonzeros in each column */ - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, MPI_DOUBLE, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /*assert(jcol 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - if ( SendCnt ) { - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - } - SUPERLU_FREE(ptr_to_send); - if ( maxnnzToRecv ) { - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - } - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. - ------------------------------------------------------------*/ - if ( nnz_loc ) { /* nnz_loc can be zero */ - if ( !(*rowind = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *rowind[]."); - if ( !(*a = doubleMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *a[]."); - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = (*colptr)[0]; - (*colptr)[0] = 0; - for (j = 1; j < n; ++j) { - k += jsize; - jsize = (*colptr)[j]; - (*colptr)[j] = k; - } - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - j = ja[i]; - k = (*colptr)[j]; - (*rowind)[k] = ia[i]; - (*a)[k] = aij[i]; - ++(*colptr)[j]; - } - - /* Reset the column pointers to the beginning of each column */ - for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; - (*colptr)[0] = 0; - - if ( nnz_loc ) { - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - } - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dReDistribute_A()"); -#endif - -} /* dReDistribute_A */ - -int_t -pddistribute(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, - gridinfo_t *grid) -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, - len, len1, nsupc; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - double *a; - int_t *asub, *xa; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - double *lusup, *uval; /* nonzero values in L and U */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t mem_use = 0, iword, dword; - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif -#if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; -#endif - - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = (NRformat_loc *) A->Store; - -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - dword = sizeof(double); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pddistribute()"); -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, - grid, &xa, &asub, &a); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf("--------\n" - ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); -#endif - - if ( fact == SamePattern_SameRowPerm ) { - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We can propagate the new values of A into the existing - L and U data structures. */ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; -#if ( PRNTlevel>=1 ) - mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); -#endif - - } else { - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for ToRecv[]."); - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) - ABORT("Malloc fails for index[]."); -#if ( PRNTlevel>=1 ) - mem_use += k*sizeof(int_t*) + (j + nsupers)*iword; -#endif - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = intCalloc_dist(k)) ) - ABORT("Malloc fails for ToSendD[]."); - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. */ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); -#if ( PRNTlevel>=1 ) - mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword; -#endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); -#endif -#if ( PRNTlevel>=1 ) - mem_use -= 2*k * iword; -#endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); - /* ------------------------------------------------ */ -#if ( PRNTlevel>=1 ) - mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ -#if ( PRNTlevel>=1 ) - mem_use += 4*k*sizeof(int_t*) + 2*len*iword; -#endif - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } /* for j ... */ - - jbrow = PROW( jb, grid ); - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if (!(Lnzval_bc_ptr[ljb] = - doubleMalloc_dist(len*nsupc))) { - fprintf(stderr, "col block %d ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb] = NULL; - Lnzval_bc_ptr[ljb] = NULL; - } /* if nrbl ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - - } /* for jb ... */ - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); -#endif - - } /* else fact != SamePattern_SameRowPerm */ - - if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ - SUPERLU_FREE(asub); - SUPERLU_FREE(a); - } - SUPERLU_FREE(xa); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pddistribute()"); -#endif - - return (mem_use); -} /* PDDISTRIBUTE */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsequ.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,231 +0,0 @@ - - -/* - * File name: pdgsequ.c - * History: Modified from LAPACK routine DGEEQU - */ -#include -#include "superlu_ddefs.h" - -void -pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info, gridinfo_t *grid) -{ -/* - Purpose - ======= - - PDGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= M: the i-th row of A is exactly zero - > M: the (i-M)-th column of A is exactly zero - - GRID (input) gridinof_t* - The 2D process mesh. - ===================================================================== -*/ - - /* Local variables */ - NRformat_loc *Astore; - double *Aval; - int i, j, irow, jcol, m_loc; - double rcmin, rcmax; - double bignum, smlnum; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *); - /* extern double dlamch_(char *); */ - double tempmax, tempmin; - double *loc_max; - int *r_sizes, *displs; - double *loc_r; - int_t procs; - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("pdgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - m_loc = Astore->m_loc; - - /* Get machine constants. */ - smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S"); - /* smlnum = dlamch_("S"); */ - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); - ++irow; - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - - /* Get the global MAX and MIN for R */ - tempmax = rcmax; - tempmin = rcmin; - MPI_Allreduce( &tempmax, &rcmax, - 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempmin, &rcmin, - 1, MPI_DOUBLE, MPI_MIN, grid->comm); - - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); - } - ++irow; - } - - /* Find the global maximum for c[j] */ - if ( !(loc_max = doubleMalloc_dist(A->ncol))) - ABORT("Malloc fails for loc_max[]."); - for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; - MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); - SUPERLU_FREE(loc_max); - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* gather R from each process to get the global R. */ - - procs = grid->nprow * grid->npcol; - if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for r_sizes[]."); - displs = r_sizes + procs; - if ( !(loc_r = doubleMalloc_dist(m_loc))) - ABORT("Malloc fails for loc_r[]."); - j = Astore->fst_row; - for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; - - /* First gather the size of each piece. */ - MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); - - /* Set up the displacements for allgatherv */ - displs[0] = 0; - for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; - - /* Now gather the actual data */ - MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, - MPI_DOUBLE, grid->comm); - - SUPERLU_FREE(r_sizes); - SUPERLU_FREE(loc_r); - - return; - -} /* pdgsequ */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,301 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_ddefs.h" - - -static void dcreate_msr_matrix(SuperMatrix *, int_t [], int_t, - double **, int_t **); -static void dPrintMSRmatrix(int, double [], int_t [], gridinfo_t *); - - -int pdgsmv_AXglobal_setup -( - SuperMatrix *A, /* Matrix A permuted by columns (input). - The type of A can be: - Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */ - Glu_persist_t *Glu_persist, /* input */ - gridinfo_t *grid, /* input */ - int_t *m, /* output */ - int_t *update[], /* output */ - double *val[], /* output */ - int_t *bindx[], /* output */ - int_t *mv_sup_to_proc /* output */ - ) -{ - int n; - int input_option; - int N_update; /* Number of variables updated on this process (output) */ - int iam = grid->iam; - int nprocs = grid->nprow * grid->npcol; - int_t *xsup = Glu_persist->xsup; - int_t *supno = Glu_persist->supno; - int_t nsupers; - int i, nsup, p, t1, t2, t3; - - - /* Initialize the list of global indices. - * NOTE: the list of global indices must be in ascending order. - */ - n = A->nrow; - input_option = SUPER_LINEAR; - nsupers = supno[n-1] + 1; - -#if ( DEBUGlevel>=2 ) - if ( !iam ) { - PrintInt10("xsup", supno[n-1]+1, xsup); - PrintInt10("supno", n, supno); - } -#endif - - if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on - individual rows. */ - /* Figure out mv_sup_to_proc[] on all processes. */ - for (p = 0; p < nprocs; ++p) { - t1 = n / nprocs; /* Number of rows */ - t2 = n - t1 * nprocs; /* left-over, which will be assigned - to the first t2 processes. */ - if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ - else { /* First t2 processes will get one more row. */ - ++t1; /* Number of rows. */ - t2 = p * t1; /* Starting row. */ - } - /* Make sure the starting and ending rows are at the - supernode boundaries. */ - t3 = t2 + t1; /* Ending row. */ - nsup = supno[t2]; - if ( t2 > xsup[nsup] ) { /* Round up the starting row. */ - t1 -= xsup[nsup+1] - t2; - t2 = xsup[nsup+1]; - } - nsup = supno[t3]; - if ( t3 > xsup[nsup] ) /* Round up the ending row. */ - t1 += xsup[nsup+1] - t3; - t3 = t2 + t1 - 1; - if ( t1 ) { - for (i = supno[t2]; i <= supno[t3]; ++i) { - mv_sup_to_proc[i] = p; -#if ( DEBUGlevel>=3 ) - if ( mv_sup_to_proc[i] == p-1 ) { - fprintf(stderr, - "mv_sup_to_proc conflicts at supno %d\n", i); - exit(-1); - } -#endif - } - } - - if ( iam == p ) { - N_update = t1; - if ( N_update ) { - if ( !(*update = intMalloc_dist(N_update)) ) - ABORT("Malloc fails for update[]"); - } - for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; -#if ( DEBUGlevel>=3 ) - printf("(%2d) N_update = %4d\t" - "supers %4d to %4d\trows %4d to %4d\n", - iam, N_update, supno[t2], supno[t3], t2, t3); -#endif - } - } /* for p ... */ - } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on - individual supernodes. */ - /* This may cause bad load balance, because the blocks are usually - small in the beginning and large toward the end. */ - t1 = nsupers / nprocs; - t2 = nsupers - t1 * nprocs; /* left-over */ - if ( iam >= t2 ) t2 += (iam * t1); - else { - ++t1; /* Number of blocks. */ - t2 = iam * t1; /* Starting block. */ - } - N_update = xsup[t2+t1] - xsup[t2]; - if ( !(*update = intMalloc_dist(N_update)) ) - ABORT("Malloc fails for update[]"); - for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; - } - - - /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */ - dcreate_msr_matrix(A, *update, N_update, val, bindx); - -#if ( DEBUGlevel>=2 ) - PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); - dPrintMSRmatrix(N_update, *val, *bindx, grid); -#endif - - *m = N_update; - return 0; -} /* PDGSMV_AXglobal_SETUP */ - - -/* Create the distributed modified sparse row (MSR) matrix: bindx/val. - * For a submatrix of size m-by-n, the MSR arrays are as follows: - * bindx[0] = m + 1 - * bindx[0..m] = pointer to start of each row - * bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k, - * where, ks = bindx[k], ke = bindx[k+1]-1 - * val[k] = A(k,k), k < m, diagonal elements - * val[m] = not used - * val[ki] = A(k, bindx[ki]), where ks <= ki <= ke - * Both arrays are of length nnz + 1. - */ -static void dcreate_msr_matrix -( - SuperMatrix *A, /* Matrix A permuted by columns (input). - The type of A can be: - Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */ - int_t update[], /* input (local) */ - int_t N_update, /* input (local) */ - double **val, /* output */ - int_t **bindx /* output */ -) -{ - int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; - NCPformat *Astore; - double *nzval; - int_t *rowcnt; - double zero = 0.0; - - if ( !N_update ) return; - - n = A->ncol; - Astore = A->Store; - nzval = Astore->nzval; - - /* One pass of original matrix A to count nonzeros of each row. */ - if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) - ABORT("Malloc fails for rowcnt[]"); - lo = update[0]; - hi = update[N_update-1]; - nnz_local = 0; - nnz_diag = 0; - for (j = 0; j < n; ++j) { - for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { - irow = Astore->rowind[i]; - if ( irow >= lo && irow <= hi ) { - if ( irow != j ) /* Exclude diagonal */ - ++rowcnt[irow - lo]; - else ++nnz_diag; /* Count nonzero diagonal entries */ - ++nnz_local; - } - } - } - - /* Add room for the logical diagonal zeros which are not counted - in nnz_local. */ - nnz_local += (N_update - nnz_diag); - - /* Allocate storage for bindx[] and val[]. */ - if ( !(*val = (double *) doubleMalloc_dist(nnz_local+1)) ) - ABORT("Malloc fails for val[]"); - for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ - if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) - ABORT("Malloc fails for bindx[]"); - - /* Set up row pointers. */ - (*bindx)[0] = N_update + 1; - for (j = 1; j <= N_update; ++j) { - (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; - rowcnt[j-1] = (*bindx)[j-1]; - } - - /* One pass of original matrix A to fill in matrix entries. */ - for (j = 0; j < n; ++j) { - for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { - irow = Astore->rowind[i]; - if ( irow >= lo && irow <= hi ) { - if ( irow == j ) /* Diagonal */ - (*val)[irow - lo] = nzval[i]; - else { - irow -= lo; - k = rowcnt[irow]; - (*bindx)[k] = j; - (*val)[k] = nzval[i]; - ++rowcnt[irow]; - } - } - } - } - - SUPERLU_FREE(rowcnt); -} - -/* - * Performs sparse matrix-vector multiplication. - * - val/bindx stores the distributed MSR matrix A - * - X is global - * - ax product is distributed the same way as A - */ -int -pdgsmv_AXglobal(int_t m, int_t update[], double val[], int_t bindx[], - double X[], double ax[]) -{ - int_t i, j, k; - - if ( m <= 0 ) return; /* number of rows (local) */ - - for (i = 0; i < m; ++i) { - ax[i] = 0.0; - - for (k = bindx[i]; k < bindx[i+1]; ++k) { - j = bindx[k]; /* column index */ - ax[i] += val[k] * X[j]; - } - ax[i] += val[i] * X[update[i]]; /* diagonal */ - } -} /* PDGSMV_AXglobal */ - -/* - * Performs sparse matrix-vector multiplication. - * - val/bindx stores the distributed MSR matrix A - * - X is global - * - ax product is distributed the same way as A - */ -int -pdgsmv_AXglobal_abs(int_t m, int_t update[], double val[], int_t bindx[], - double X[], double ax[]) -{ - int_t i, j, k; - - if ( m <= 0 ) return; /* number of rows (local) */ - - for (i = 0; i < m; ++i) { - ax[i] = 0.0; - for (k = bindx[i]; k < bindx[i+1]; ++k) { - j = bindx[k]; /* column index */ - ax[i] += fabs(val[k]) * fabs(X[j]); - } - ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */ - } -} /* PDGSMV_AXglobal_ABS */ - -/* - * Print the local MSR matrix - */ -static void dPrintMSRmatrix -( - int m, /* Number of rows of the submatrix. */ - double val[], - int_t bindx[], - gridinfo_t *grid -) -{ - int iam, nnzp1; - - if ( !m ) return; - - iam = grid->iam; - nnzp1 = bindx[m]; - printf("(%2d) MSR submatrix has %d rows -->\n", iam, m); - PrintDouble5("val", nnzp1, val); - PrintInt10("bindx", nnzp1, bindx); -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,370 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_ddefs.h" - -void pdgsmv_init -( - SuperMatrix *A, /* Matrix A permuted by columns (input/output). - The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */ - int_t *row_to_proc, /* Input. Mapping between rows and processes. */ - gridinfo_t *grid, /* Input */ - pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ - ) -{ - NRformat_loc *Astore; - int iam, p, procs; - int *SendCounts, *RecvCounts; - int_t i, j, k, l, m, m_loc, n, fst_row, jcol; - int_t TotalIndSend, TotalValSend; - int_t *colind, *rowptr; - int_t *ind_tosend = NULL, *ind_torecv = NULL; - int_t *ptr_ind_tosend, *ptr_ind_torecv; - int_t *extern_start, *spa, *itemp; - double *nzval, *val_tosend = NULL, *val_torecv = NULL, t; - MPI_Request *send_req, *recv_req; - MPI_Status status; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - m = A->nrow; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - colind = Astore->colind; - rowptr = Astore->rowptr; - nzval = Astore->nzval; - if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) - ABORT("Malloc fails for SendCounts[]"); - /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ - RecvCounts = SendCounts + procs; - if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) - ABORT("Malloc fails for ptr_ind_tosend[]"); - ptr_ind_torecv = ptr_ind_tosend + procs + 1; - if ( !(extern_start = intMalloc_dist(m_loc)) ) - ABORT("Malloc fails for extern_start[]"); - for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. - THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. - SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE - LOCAL PART OF X. - THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ - ABORT("Malloc fails for spa[]"); - for (p = 0; p < procs; ++p) SendCounts[p] = 0; - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = extern_start[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ - jcol = colind[j]; - p = row_to_proc[jcol]; - if ( p != iam ) { /* External */ - if ( spa[jcol] == 0 ) { /* First time see this index */ - ++SendCounts[p]; - spa[jcol] = 1; - } - } else { /* Swap to beginning the part of A corresponding - to the local part of X */ - l = colind[k]; - t = nzval[k]; - colind[k] = jcol; - nzval[k] = nzval[j]; - colind[j] = l; - nzval[j] = t; - ++k; - } - } - extern_start[i] = k; - } - - /* ------------------------------------------------------------ - LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. - THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - /* Build pointers to ind_tosend[]. */ - ptr_ind_tosend[0] = 0; - for (p = 0, TotalIndSend = 0; p < procs; ++p) { - TotalIndSend += SendCounts[p]; /* Total to send. */ - ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; - } -#if 0 - ptr_ind_tosend[iam] = 0; /* Local part of X */ -#endif - if ( TotalIndSend ) { - if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) - ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ - } - - /* Build SPA to aid global to local translation. */ - for (i = 0; i < n; ++i) spa[i] = EMPTY; - for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - if ( spa[jcol] == EMPTY ) { /* First time see this index */ - p = row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - /*assert(jcol>=fst_row);*/ - spa[jcol] = jcol - fst_row; /* Relative position in local X */ - } else { /* External */ - ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ - spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ - ++ptr_ind_tosend[p]; - } - } - } - } - - /* ------------------------------------------------------------ - TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. - THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - colind[j] = spa[jcol]; - } - } - - /* ------------------------------------------------------------ - COMMUNICATE THE EXTERNAL INDICES OF X. - ------------------------------------------------------------*/ - MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, - grid->comm); - - /* Build pointers to ind_torecv[]. */ - ptr_ind_torecv[0] = 0; - for (p = 0, TotalValSend = 0; p < procs; ++p) { - TotalValSend += RecvCounts[p]; /* Total to receive. */ - ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; - } - if ( TotalValSend ) { - if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) - ABORT("Malloc fails for ind_torecv[]"); - } - - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) - ABORT("Malloc fails for recv_req[]."); - recv_req = send_req + procs; - for (p = 0; p < procs; ++p) { - ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ - if ( SendCounts[p] ) { - MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], - mpi_int_t, p, iam, grid->comm, &send_req[p]); - } - if ( RecvCounts[p] ) { - MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], - mpi_int_t, p, p, grid->comm, &recv_req[p]); - } - } - for (p = 0; p < procs; ++p) { - if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Allocate storage for the X values to to transferred. */ - if ( TotalIndSend && - !(val_torecv = doubleMalloc_dist(TotalIndSend)) ) - ABORT("Malloc fails for val_torecv[]."); - if ( TotalValSend && - !(val_tosend = doubleMalloc_dist(TotalValSend)) ) - ABORT("Malloc fails for val_tosend[]."); - - gsmv_comm->extern_start = extern_start; - gsmv_comm->ind_tosend = ind_tosend; - gsmv_comm->ind_torecv = ind_torecv; - gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; - gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; - gsmv_comm->SendCounts = SendCounts; - gsmv_comm->RecvCounts = RecvCounts; - gsmv_comm->val_tosend = val_tosend; - gsmv_comm->val_torecv = val_torecv; - gsmv_comm->TotalIndSend = TotalIndSend; - gsmv_comm->TotalValSend = TotalValSend; - - SUPERLU_FREE(spa); - SUPERLU_FREE(send_req); - -#if ( DEBUGlevel>=2 ) - PrintInt10("pdgsmv_init::rowptr", m_loc+1, rowptr); - PrintInt10("pdgsmv_init::extern_start", m_loc, extern_start); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsmv_init()"); -#endif - -} /* PDGSMV_INIT */ - - -/* - * Performs sparse matrix-vector multiplication. - */ -void -pdgsmv -( - int_t abs, /* Input. Do abs(A)*abs(x). */ - SuperMatrix *A_internal, /* Input. Matrix A permuted by columns. - The column indices are translated into - the relative positions in the gathered x-vector. - The type of A can be: - Stype = NR_loc; Dtype = SLU_D; Mtype = GE. */ - gridinfo_t *grid, /* Input */ - pdgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */ - double x[], /* Input. The distributed source vector */ - double ax[] /* Output. The distributed destination vector */ -) -{ - NRformat_loc *Astore; - int iam, procs; - int_t i, j, p, m, m_loc, n, fst_row, jcol; - int_t *colind, *rowptr; - int *SendCounts, *RecvCounts; - int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv; - int_t *extern_start, TotalValSend; - double *nzval, *val_tosend, *val_torecv; - double zero = 0.0; - MPI_Request *send_req, *recv_req; - MPI_Status status; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdgsmv()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A_internal->Store; - m = A_internal->nrow; - n = A_internal->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - colind = Astore->colind; - rowptr = Astore->rowptr; - nzval = (double *) Astore->nzval; - extern_start = gsmv_comm->extern_start; - ind_torecv = gsmv_comm->ind_torecv; - ptr_ind_tosend = gsmv_comm->ptr_ind_tosend; - ptr_ind_torecv = gsmv_comm->ptr_ind_torecv; - SendCounts = gsmv_comm->SendCounts; - RecvCounts = gsmv_comm->RecvCounts; - val_tosend = (double *) gsmv_comm->val_tosend; - val_torecv = (double *) gsmv_comm->val_torecv; - TotalValSend = gsmv_comm->TotalValSend; - - /* ------------------------------------------------------------ - COPY THE X VALUES INTO THE SEND BUFFER. - ------------------------------------------------------------*/ - for (i = 0; i < TotalValSend; ++i) { - j = ind_torecv[i] - fst_row; /* Relative index in x[] */ - val_tosend[i] = x[j]; - } - - /* ------------------------------------------------------------ - COMMUNICATE THE X VALUES. - ------------------------------------------------------------*/ - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) - ABORT("Malloc fails for recv_req[]."); - recv_req = send_req + procs; - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) { - MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p], - MPI_DOUBLE, p, iam, - grid->comm, &send_req[p]); - } - if ( SendCounts[p] ) { - MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p], - MPI_DOUBLE, p, p, - grid->comm, &recv_req[p]); - } - } - - /* ------------------------------------------------------------ - PERFORM THE ACTUAL MULTIPLICATION. - ------------------------------------------------------------*/ - if ( abs ) { /* Perform abs(A)*abs(x) */ - /* Multiply the local part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - ax[i] = 0.0; - for (j = rowptr[i]; j < extern_start[i]; ++j) { - jcol = colind[j]; - ax[i] += fabs(nzval[j]) * fabs(x[jcol]); - } - } - - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Multiply the external part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - for (j = extern_start[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]); - } - } - } else { - /* Multiply the local part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - ax[i] = zero; - for (j = rowptr[i]; j < extern_start[i]; ++j) { - jcol = colind[j]; - ax[i] += nzval[j] * x[jcol]; - } - } - - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Multiply the external part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - for (j = extern_start[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - ax[i] += nzval[j] * val_torecv[jcol]; - } - } - } - - SUPERLU_FREE(send_req); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsmv()"); -#endif - -} /* PDGSMV */ - -void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm) -{ - int_t *it; - double *dt; - SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); - SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); - SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,440 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_ddefs.h" - -/*-- Function prototypes --*/ -static void gather_1rhs_diag_to_all(int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], double [], double []); -static void redist_all_to_diag(int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t [], double []); - -void -pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx, - int nrhs, double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pdgsrfs_ABXglobal improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc - * are permutation matrices. The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input) double* (global) - * The N-by-NRHS right-hand side matrix of the possibly equilibrated - * and row permuted system. - * - * NOTE: Currently, B must reside on all processes when calling - * this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * X (input/output) double* (global) - * On entry, the solution matrix X, as computed by PDGSTRS. - * On exit, the improved solution matrix X. - * If DiagScale = COL or BOTH, X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * NOTE: Currently, X must reside on all processes when calling - * this routine. - * - * ldx (input) int (global) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - /* - * Data structures used by matrix-vector multiply routine. - */ - int_t N_update; /* Number of variables updated on this process */ - int_t *update; /* vector elements (global index) updated - on this processor. */ - int_t *bindx; - double *val; - int_t *mv_sup_to_proc; /* Supernode to process mapping in - matrix-vector multiply. */ - /*-- end data structures for matrix-vector multiply --*/ - double *b, *ax, *R, *B_col, *temp, *work, *X_col, - *x_trs, *dx_trs; - int_t count, ii, j, jj, k, knsupc, lk, lwork, - nprow, nsupers, nz, p; - int i, iam, pkk; - int_t *ilsum, *xsup; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* NEW STUFF */ - int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ - int_t *diag_len; /* Length of the X vector on diagonal processes. */ - - /*-- Function prototypes --*/ - extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *, - double *, int, SuperLUStat_t *, int *); - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - xerbla_("pdgsrfs_ABXglobal", &i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - /* Initialization. */ - iam = grid->iam; - nprow = grid->nprow; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()"); -#endif - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. number of diag processes = %d\n", num_diag_procs); - PrintInt10("diag_procs", num_diag_procs, diag_procs); - PrintInt10("diag_len", num_diag_procs, diag_len); - } -#endif - - if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for mv_sup_to_proc[]"); - - pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, - &val, &bindx, mv_sup_to_proc); - - i = CEILING( nsupers, nprow ); /* Number of local block rows */ - ii = Llu->ldalsum + i * XK_H; - k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); - jj = SUPERLU_MAX( jj, N_update ); - lwork = N_update /* For ax and R */ - + ii /* For dx_trs */ - + ii /* For x_trs */ - + k /* For b */ - + jj; /* for temp */ - if ( !(work = doubleMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = work; - dx_trs = work + N_update; - x_trs = dx_trs + ii; - b = x_trs + ii; - temp = b + k; - -#if ( DEBUGlevel>=2 ) - { - double *dwork = doubleMalloc_dist(n); - for (i = 0; i < n; ++i) { - if ( i & 1 ) dwork[i] = 1.; - else dwork[i] = 2.; - } - /* Check correctness of matrix-vector multiply. */ - pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); - PrintDouble5("Mult A*x", N_update, ax); - SUPERLU_FREE(dwork); - } -#endif - - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - - /* Set SAFE1 essentially to be the underflow threshold times the - number of additions in each row. */ - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - - /* Copy X into x on the diagonal processes. */ - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - jj = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; - dx_trs[ii-XK_H] = k;/* Block number prepended in header. */ - } - } - } - /* Copy B into b distributed the same way as matrix-vector product. */ - if ( N_update ) ii = update[0]; - for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); - - /* Compute residual. */ - for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i]; - - /* Compute abs(op(A))*abs(X) + abs(B). */ - pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp); - for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]); - - s = 0.0; - for (i = 0; i < N_update; ++i) { - if ( temp[i] > safe2 ) { - s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); - } else if ( temp[i] != 0.0 ) { - /* Adding SAFE1 to the numerator guards against - spuriously zero residuals (underflow). */ - s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]); - } - /* If temp[i] is exactly 0.0 (computed by PxGSMV), then - we know the true residual also must be exactly 0.0. */ - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - redist_all_to_diag(n, R, Glu_persist, Llu, grid, - mv_sup_to_proc, dx_trs); - pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); - - /* Update solution. */ - for (p = 0; p < num_diag_procs; ++p) - if ( iam == diag_procs[p] ) - for (k = p; k < nsupers; k += num_diag_procs) { - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - knsupc = SuperSize( k ); - for (i = 0; i < knsupc; ++i) - x_trs[i + ii] += dx_trs[i + ii]; - } - lstres = berr[j]; - ++count; - /* Transfer x_trs (on diagonal processes) into X - (on all processes). */ - gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, - num_diag_procs, diag_procs, diag_len, - X_col, temp); - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - - /* Deallocate storage used by matrix-vector multiplication. */ - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - if ( N_update ) { - SUPERLU_FREE(update); - SUPERLU_FREE(bindx); - SUPERLU_FREE(val); - } - SUPERLU_FREE(mv_sup_to_proc); - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()"); -#endif - -} /* PDGSRFS_ABXGLOBAL */ - - -/* - * r[] is the residual vector distributed the same way as - * matrix-vector product. - */ -static void -redist_all_to_diag(int_t n, double r[], Glu_persist_t *Glu_persist, - LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], - double work[]) -{ - int_t i, ii, k, lk, lr, nsupers; - int_t *ilsum, *xsup; - int iam, knsupc, psrc, pkk; - MPI_Status status; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - lr = 0; - - for (k = 0; k < nsupers; ++k) { - pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); - psrc = mv_sup_to_proc[k]; - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - if ( iam == psrc ) { - if ( iam != pkk ) { /* Send X component. */ - MPI_Send( &r[lr], knsupc, MPI_DOUBLE, pkk, Xk, - grid->comm ); - } else { /* Local copy. */ - for (i = 0; i < knsupc; ++i) - work[i + ii] = r[i + lr]; - } - lr += knsupc; - } else { - if ( iam == pkk ) { /* Recv X component. */ - MPI_Recv( &work[ii], knsupc, MPI_DOUBLE, psrc, Xk, - grid->comm, &status ); - } - } - } -} /* REDIST_ALL_TO_DIAG */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_1rhs_diag_to_all(int_t n, double x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - double y[], double work[]) -{ - int_t i, ii, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; - lwork += knsupc; - } - MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p], MPI_DOUBLE, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; - lwork += knsupc; - } - } -} /* GATHER_1RHS_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,432 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_ddefs.h" - -/*-- Function prototypes --*/ -static void gather_1rhs_diag_to_all(int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], double [], double []); -static void redist_all_to_diag(int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t [], double []); - -void -pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx, - int nrhs, double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pdgsrfs_ABXglobal improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc - * are permutation matrices. The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input) double* (global) - * The N-by-NRHS right-hand side matrix of the possibly equilibrated - * and row permuted system. - * - * NOTE: Currently, B must reside on all processes when calling - * this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * X (input/output) double* (global) - * On entry, the solution matrix X, as computed by PDGSTRS. - * On exit, the improved solution matrix X. - * If DiagScale = COL or BOTH, X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * NOTE: Currently, X must reside on all processes when calling - * this routine. - * - * ldx (input) int (global) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - /* - * Data structures used by matrix-vector multiply routine. - */ - int_t N_update; /* Number of variables updated on this process */ - int_t *update; /* vector elements (global index) updated - on this processor. */ - int_t *bindx; - double *val; - int_t *mv_sup_to_proc; /* Supernode to process mapping in - matrix-vector multiply. */ - /*-- end data structures for matrix-vector multiply --*/ - double *b, *ax, *R, *B_col, *temp, *work, *X_col, - *x_trs, *dx_trs; - int_t count, ii, j, jj, k, knsupc, lk, lwork, - nprow, nsupers, nz, p; - int i, iam, pkk; - int_t *ilsum, *xsup; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* NEW STUFF */ - int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ - int_t *diag_len; /* Length of the X vector on diagonal processes. */ - - /*-- Function prototypes --*/ - extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *, - double *, int, SuperLUStat_t *, int *); - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - xerbla_("pdgsrfs_ABXglobal", &i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - /* Initialization. */ - iam = grid->iam; - nprow = grid->nprow; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()"); -#endif - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. number of diag processes = %d\n", num_diag_procs); - PrintInt10("diag_procs", num_diag_procs, diag_procs); - PrintInt10("diag_len", num_diag_procs, diag_len); - } -#endif - - if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for mv_sup_to_proc[]"); - - pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, - &val, &bindx, mv_sup_to_proc); - - i = CEILING( nsupers, nprow ); /* Number of local block rows */ - ii = Llu->ldalsum + i * XK_H; - k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); - jj = SUPERLU_MAX( jj, N_update ); - lwork = N_update /* For ax and R */ - + ii /* For dx_trs */ - + ii /* For x_trs */ - + k /* For b */ - + jj; /* for temp */ - if ( !(work = doubleMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = work; - dx_trs = work + N_update; - x_trs = dx_trs + ii; - b = x_trs + ii; - temp = b + k; - -#if ( DEBUGlevel>=2 ) - { - double *dwork = doubleMalloc_dist(n); - for (i = 0; i < n; ++i) { - if ( i & 1 ) dwork[i] = 1.; - else dwork[i] = 2.; - } - /* Check correctness of matrix-vector multiply. */ - pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); - PrintDouble5("Mult A*x", N_update, ax); - SUPERLU_FREE(dwork); - } -#endif - - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - - /* Copy X into x on the diagonal processes. */ - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - jj = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; - dx_trs[ii-XK_H] = k;/* Block number prepended in header. */ - } - } - } - /* Copy B into b distributed the same way as matrix-vector product. */ - if ( N_update ) ii = update[0]; - for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); - - /* Compute residual. */ - for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i]; - - /* Compute abs(op(A))*abs(X) + abs(B). */ - pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp); - for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]); - - s = 0.0; - for (i = 0; i < N_update; ++i) { - if ( temp[i] > safe2 ) - s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); - else - s = SUPERLU_MAX(s, (fabs(R[i])+safe1)/(temp[i]+safe1)); - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - redist_all_to_diag(n, R, Glu_persist, Llu, grid, - mv_sup_to_proc, dx_trs); - pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); - - /* Update solution. */ - for (p = 0; p < num_diag_procs; ++p) - if ( iam == diag_procs[p] ) - for (k = p; k < nsupers; k += num_diag_procs) { - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - knsupc = SuperSize( k ); - for (i = 0; i < knsupc; ++i) - x_trs[i + ii] += dx_trs[i + ii]; - } - lstres = berr[j]; - ++count; - /* Transfer x_trs (on diagonal processes) into X - (on all processes). */ - gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, - num_diag_procs, diag_procs, diag_len, - X_col, temp); - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - - /* Deallocate storage used by matrix-vector multiplication. */ - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - if ( N_update ) { - SUPERLU_FREE(update); - SUPERLU_FREE(bindx); - SUPERLU_FREE(val); - } - SUPERLU_FREE(mv_sup_to_proc); - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()"); -#endif - -} /* PDGSRFS_ABXGLOBAL */ - - -/* - * r[] is the residual vector distributed the same way as - * matrix-vector product. - */ -static void -redist_all_to_diag(int_t n, double r[], Glu_persist_t *Glu_persist, - LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], - double work[]) -{ - int_t i, ii, k, lk, lr, nsupers; - int_t *ilsum, *xsup; - int iam, knsupc, psrc, pkk; - MPI_Status status; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - lr = 0; - - for (k = 0; k < nsupers; ++k) { - pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); - psrc = mv_sup_to_proc[k]; - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - if ( iam == psrc ) { - if ( iam != pkk ) { /* Send X component. */ - MPI_Send( &r[lr], knsupc, MPI_DOUBLE, pkk, Xk, - grid->comm ); - } else { /* Local copy. */ - for (i = 0; i < knsupc; ++i) - work[i + ii] = r[i + lr]; - } - lr += knsupc; - } else { - if ( iam == pkk ) { /* Recv X component. */ - MPI_Recv( &work[ii], knsupc, MPI_DOUBLE, psrc, Xk, - grid->comm, &status ); - } - } - } -} /* REDIST_ALL_TO_DIAG */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_1rhs_diag_to_all(int_t n, double x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - double y[], double work[]) -{ - int_t i, ii, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; - lwork += knsupc; - } - MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p], MPI_DOUBLE, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; - lwork += knsupc; - } - } -} /* GATHER_1RHS_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,248 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_ddefs.h" - -void -pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, - double *B, int_t ldb, double *X, int_t ldx, int nrhs, - SOLVEstruct_t *SOLVEstruct, - double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PDGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * ScalePermstruct (input) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input) double* (local) - * The m_loc-by-NRHS right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * X (input/output) double* (local) - * On entry, the solution matrix Y, as computed by PDGSTRS, of the - * transformed system A1*Y = Pc*Pr*B. where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X. - * On exit, the improved solution matrix Y. - * - * In order to obtain the solution X to the original system, - * Y should be permutated by Pc^T, and premultiplied by diag(C) - * if DiagScale = COL or BOTH. - * This must be done after this routine is called. - * - * ldx (input) int (local) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double *ax, *R, *dx, *temp, *work, *B_col, *X_col; - int_t count, i, j, lwork, nz; - int iam; - double eps, lstres; - double s, safmin, safe1, safe2; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *); - - /* Data structures used by matrix-vector multiply routine. */ - pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; - NRformat_loc *Astore; - int_t m_loc, fst_row; - - - /* Initialization. */ - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - iam = grid->iam; - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - pxerbla("PDGSRFS", grid, i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgsrfs()"); -#endif - - lwork = 2 * m_loc; /* For ax/R/dx and temp */ - if ( !(work = doubleMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = dx = work; - temp = ax + m_loc; - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - /* eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); */ - eps = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Epsilon"); - safmin = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum"); - - /* Set SAFE1 essentially to be the underflow threshold times the - number of additions in each row. */ - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pdgsmv(0, A, grid, gsmv_comm, X_col, ax); - - /* Compute residual, stored in R[]. */ - for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i]; - - /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ - pdgsmv(1, A, grid, gsmv_comm, X_col, temp); - for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]); - - s = 0.0; - for (i = 0; i < m_loc; ++i) { - if ( temp[i] > safe2 ) { - s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); - } else if ( temp[i] != 0.0 ) { - /* Adding SAFE1 to the numerator guards against - spuriously zero residuals (underflow). */ - s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]); - } - /* If temp[i] is exactly 0.0 (computed by PxGSMV), then - we know the true residual also must be exactly 0.0. */ - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - pdgstrs(n, LUstruct, ScalePermstruct, grid, - dx, m_loc, fst_row, m_loc, 1, - SOLVEstruct, stat, info); - - /* Update solution. */ - for (i = 0; i < m_loc; ++i) X_col[i] += dx[i]; - - lstres = berr[j]; - ++count; - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - /* Deallocate storage. */ - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsrfs()"); -#endif - -} /* PDGSRFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,237 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_ddefs.h" - -void -pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, - double *B, int_t ldb, double *X, int_t ldx, int nrhs, - SOLVEstruct_t *SOLVEstruct, - double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PDGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * ScalePermstruct (input) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input) double* (local) - * The m_loc-by-NRHS right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * X (input/output) double* (local) - * On entry, the solution matrix Y, as computed by PDGSTRS, of the - * transformed system A1*Y = Pc*Pr*B. where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X. - * On exit, the improved solution matrix Y. - * - * In order to obtain the solution X to the original system, - * Y should be permutated by Pc^T, and premultiplied by diag(C) - * if DiagScale = COL or BOTH. - * This must be done after this routine is called. - * - * ldx (input) int (local) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double *ax, *R, *dx, *temp, *work, *B_col, *X_col; - int_t count, i, j, lwork, nz; - int iam; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* Data structures used by matrix-vector multiply routine. */ - pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; - NRformat_loc *Astore; - int_t m_loc, fst_row; - - - /* Initialization. */ - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - iam = grid->iam; - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - pxerbla("PDGSRFS", grid, i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgsrfs()"); -#endif - - lwork = 2 * m_loc; /* For ax/R/dx and temp */ - if ( !(work = doubleMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = dx = work; - temp = ax + m_loc; - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pdgsmv(0, A, grid, gsmv_comm, X_col, ax); - - /* Compute residual, stored in R[]. */ - for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i]; - - /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ - pdgsmv(1, A, grid, gsmv_comm, X_col, temp); - for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]); - - s = 0.0; - for (i = 0; i < m_loc; ++i) { - if ( temp[i] > safe2 ) - s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); - else - s = SUPERLU_MAX(s, (fabs(R[i]) + safe1)/(temp[i]+safe1)); - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - pdgstrs(n, LUstruct, ScalePermstruct, grid, - dx, m_loc, fst_row, m_loc, 1, - SOLVEstruct, stat, info); - - /* Update solution. */ - for (i = 0; i < m_loc; ++i) X_col[i] += dx[i]; - - lstres = berr[j]; - ++count; - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - /* Deallocate storage. */ - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgsrfs()"); -#endif - -} /* PDGSRFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1065 +0,0 @@ - - -#include -#include "superlu_ddefs.h" - - -void -pdgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - double B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * - * pdgssvx_ABglobal solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right hand sides, and its dimensions ldb and nrhs - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has several options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * - A, the input matrix - * - * as well as the following options, which are described in more - * detail below: - * - * - options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * - options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * - options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * - options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * - ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * - ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * - ScalePermstruct->R, array of row scale factors - * - ScalePermstruct->C, array of column scale factors - * - ScalePermstruct->perm_r, row permutation vector - * - ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * Pc*Pr*diag(R)*A*diag(C) - * where - * Pr and Pc are row and columns permutation matrices determined - * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, - * respectively, and - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * - LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In this - * case the algorithm saves time by reusing the previously computed - * column permutation vector stored in ScalePermstruct->perm_c - * and the "elimination tree" of A stored in LUstruct->etree. - * - * In this case the user must still specify the following options - * as before: - * - * - options->Equil - * - options->RowPerm - * - options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * - A, the input matrix - * - ScalePermstruct->perm_c, the column permutation - * - LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * as described above - * - ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * - LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * - options->Equil - * - options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are ignored. - * This is because the permutations from ScalePermstruct->perm_r and - * ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * - A, the input matrix - * - ScalePermstruct->DiagScale, how the previous matrix was row and/or - * column scaled - * - ScalePermstruct->R, the row scalings of the previous matrix, if any - * - ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * - ScalePermstruct->perm_r, the row permutation of the previous matrix - * - ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * - all of LUstruct, the previously computed information about L and U - * (the actual numerical values of L and U stored in - * LUstruct->Llu are ignored) - * - * The outputs returned include - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * as described above - * - ScalePermstruct, modified to describe how the input matrix A was - * equilibrated - * (thus ScalePermstruct->DiagScale, R and C may be modified) - * - LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * - A, the unfactored matrix, only in the case that iterative refinment - * is to be done (specifically A must be the output A from - * the previous call, so that it has been scaled and permuted) - * - all of ScalePermstruct - * - all of LUstruct, including the actual numerical values of L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. That is, A is stored in - * compressed column format (also known as Harwell-Boeing format). - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine pdgstrf can factorize rectangular matrices. - * On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C), - * depending on ScalePermstruct->DiagScale, options->RowPerm and - * options->colpem: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->RowPerm != NATURAL, A is further overwritten by - * Pr*diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * Pc*Pr*diag(R)*A*diag(C). - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * ScalePermstruct (input/output) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) double* - * On entry, the right-hand side matrix of dimension (A->nrow, nrhs). - * On exit, the solution matrix if info = 0; - * - * NOTE: Currently, B must reside in all processes when calling - * this routine. - * - * ldb (input) int (global) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * - * See superlu_ddefs.h for the definitions of various data types. - * - */ - SuperMatrix AC; - NCformat *Astore; - NCPformat *ACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by DDISTRIBUTE - routine. They will be freed after DDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - double *a; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *colptr, *rowind; - int_t colequ, Equil, factored, job, notran, rowequ; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int iam; - int ldx; /* LDA for matrix X (global). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - double *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - - /* Test input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < A->nrow ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pdgssvx_ABglobal", grid, -*info); - return; - } - - /* Initialization */ - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - m = A->nrow; - n = A->ncol; - Astore = A->Store; - nnz = Astore->nnz; - a = Astore->nzval; - colptr = Astore->colptr; - rowind = Astore->rowind; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()"); -#endif - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - a[i] *= R[irow]; /* Scale rows. */ - } - } - break; - case COL: - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) - a[i] *= C[j]; /* Scale columns. */ - break; - case BOTH: - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ - } - } - break; - } - } else { - if ( !iam ) { - /* Compute row and column scalings to equilibrate matrix A. */ - dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); - - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); - } else { - if ( iinfo > 0 ) { - if ( iinfo <= m ) - fprintf(stderr, "The %d-th row of A is exactly zero\n", - iinfo); - else fprintf(stderr, "The %d-th column of A is exactly zero\n", - iinfo-n); - exit(-1); - } - } - } else { - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); - } else { - ABORT("DGSEQU failed\n"); - } - } - - /* Equilibrate matrix A. */ - dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* end if Equil ... */ - - /* ------------------------------------------------------------ - Permute rows of A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ - || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else if ( !factored ) { - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation for large diagonal. */ - dldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ - rowind[i] = perm_r[irow]; -#if ( PRNTlevel>=2 ) - if ( rowind[i] == j ) /* New diagonal */ - dprod *= fabs(a[i]); -#endif - } - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - } else { /* No equilibration. */ - for (i = colptr[0]; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; -#if ( PRNTlevel>=2 ) - if ( rowind[i] == j ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if ( job == 4 ) - dsum += fabs(a[i]); - else if ( job == 5 ) - dprod *= fabs(a[i]); - } -#endif - } - } - } - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* else !factored */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); -#endif - - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = dlangs_dist(norm, A); -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. anorm %e\n", anorm); -#endif - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && Fact == DOFACT ) - /* Use an ordering provided by SuperLU */ - get_perm_c_dist(iam, permc_spec, A, perm_c); - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - sp_colorder(options, A, perm_c, etree, &AC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ - ACstore = AC.Store; - for (j = 0; j < n; ++j) - for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { - irow = ACstore->rowind[i]; - ACstore->rowind[i] = perm_c[irow]; - } - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Perform a symbolic factorization on matrix A and set up the - nonzero data structures which are suitable for supernodal GENP. */ - if ( Fact != SamePattern_SameRowPerm ) { -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - iinfo = symbfact(options, iam, &AC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - - if ( iinfo < 0 ) { - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr, "symbfact() error returns %d\n", iinfo); - exit(-1); - } - } - } - - /* Distribute the L and U factors onto the process grid. */ - t = SuperLU_timer_(); - dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factor. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - -#if ( PRNTlevel>=2 ) - if ( !iam ) printf(".. pdgstrf INFO = %d\n", *info); -#endif - - } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ - /* Permute columns of A to form A*Pc' using the existing perm_c. - * NOTE: rows of A were previously permuted to Pc*A. - */ - sp_colorder(options, A, perm_c, NULL, &AC); - } /* if !factored ... */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doubleMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_col[i] *= R[i]; - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_col[i] *= C[i]; - b_col += ldb; - } - } - - /* ------------------------------------------------------------ - Permute the right-hand side to form Pr*B. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - if ( notran ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; - for (i = 0; i < m; ++i) b_col[i] = b_work[i]; - b_col += ldb; - } - } - } - - - /* ------------------------------------------------------------ - Permute the right-hand side to form Pc*B. - ------------------------------------------------------------*/ - if ( notran ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; - for (i = 0; i < m; ++i) b_col[i] = b_work[i]; - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - t = SuperLU_timer_(); - pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, - X, ldx, nrhs, berr, stat, info); - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix X <= Pc'*X. */ - for (j = 0; j < nrhs; j++) { - b_col = &B[j*ldb]; - x_col = &X[j*ldx]; - for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; - } - - /* Transform the solution matrix X to a solution of the original system - before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < n; ++i) b_col[i] *= C[i]; - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < n; ++i) b_col[i] *= R[i]; - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it is not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored || (factored && options->IterRefine) ) - Destroy_CompCol_Permuted_dist(&AC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()"); -#endif -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1283 +0,0 @@ - - -#include -#include "superlu_ddefs.h" - -void -pdgssvx(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - double B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 2.2) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * Feburary 20, 2008 - * - * - * Purpose - * ======= - * - * PDGSSVX solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * The input matrices A and B are distributed by block rows. - * Here is a graphical illustration (0-based indexing): - * - * A B - * 0 --------------- ------ - * | | | | - * | | P0 | | - * | | | | - * --------------- ------ - * - fst_row->| | | | - * | | | | | - * m_loc | | P1 | | - * | | | | | - * - | | | | - * --------------- ------ - * | . | |. | - * | . | |. | - * | . | |. | - * --------------- ------ - * - * where, fst_row is the row number of the first row, - * m_loc is the number of rows local to this processor - * These are defined in the 'SuperMatrix' structure, see supermatrix.h. - * - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right-hand sides, distributed by block rows, - * and its dimensions ldb (local) and nrhs (global) - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has four options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * o A, the input matrix - * - * as well as the following options to determine what matrix to - * factorize. - * - * o options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * o options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * o options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * o options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * o ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * . ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * . ScalePermstruct->R, array of row scale factors - * . ScalePermstruct->C, array of column scale factors - * . ScalePermstruct->perm_r, row permutation vector - * . ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix diag(R)*A*diag(C)*Pc^T, where - * Pc is the row permutation matrix determined by - * ScalePermstruct->perm_c - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * o LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In - * this case the algorithm saves time by reusing the previously - * computed column permutation vector stored in - * ScalePermstruct->perm_c and the "elimination tree" of A - * stored in LUstruct->etree - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->RowPerm - * o options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->perm_c, the column permutation - * o LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * o LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are - * ignored. This is because the permutations from ScalePermstruct->perm_r - * and ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->DiagScale, how the previous matrix was row - * and/or column scaled - * o ScalePermstruct->R, the row scalings of the previous matrix, - * if any - * o ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * o ScalePermstruct->perm_r, the row permutation of the previous - * matrix - * o ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * o all of LUstruct, the previously computed information about - * L and U (the actual numerical values of L and U - * stored in LUstruct->Llu are ignored) - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated (thus ScalePermstruct->DiagScale, - * R and C may be modified) - * o LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * o A, the unfactored matrix, only in the case that iterative - * refinment is to be done (specifically A must be the output - * A from the previous call, so that it has been scaled and permuted) - * o all of ScalePermstruct - * o all of LUstruct, including the actual numerical values of - * L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* (global) - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* (local) - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * That is, A is stored in distributed compressed row format. - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, - * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * diag(R)*A*diag(C)*Pc^T. - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * ScalePermstruct (input/output) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) double* (local) - * On entry, the right-hand side matrix of dimension (m_loc, nrhs), - * where, m_loc is the number of rows stored locally on my - * process and is defined in the data structure of matrix A. - * On exit, the solution matrix if info = 0; - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* (global) - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) (global) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) (global) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) (local) - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * SOLVEstruct (input/output) SOLVEstruct_t* - * The data structure to hold the communication pattern used - * in the phases of triangular solution and iterative refinement. - * This pattern should be intialized only once for repeated solutions. - * If options->SolveInitialized = YES, it is an input argument. - * If options->SolveInitialized = NO and nrhs != 0, it is an output - * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. - * - * berr (output) double*, dimension (nrhs) (global) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * See superlu_ddefs.h for the definitions of varioous data types. - * - */ - NRformat_loc *Astore; - SuperMatrix GA; /* Global A in NC format */ - NCformat *GAstore; - double *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by PDDISTRIBUTE - routine. They will be freed after PDDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - double *a; - int_t *colptr, *rowind; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *rowptr, *colind; /* Local A in NR*/ - int_t *rowind_loc, *colptr_loc; - int_t colequ, Equil, factored, job, notran, rowequ, need_value; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int_t nnz_loc, m_loc, fst_row, icol; - int iam; - int ldx; /* LDA for matrix X (local). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - double *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - int_t procs; - - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep, parSymbFact; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - /* Initialization. */ - m = A->nrow; - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (double *) Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - /* Test the input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < m_loc ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pdgssvx", grid, -*info); - return; - } - - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. */ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgssvx()"); -#endif - - /* Not factored & ask for equilibration */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j+1]; ++i){ - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } - } else { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - /* Equilibrate matrix A if it is badly-scaled. */ - pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* if Equil ... */ - - if ( !factored ) { /* Skip this if already factored. */ - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if ( Fact != SamePattern_SameRowPerm ) { - need_value = (options->RowPerm == LargeDiag); - pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *) GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - if ( need_value ) a_GA = (double *) GAstore->nzval; - else assert(GAstore->nzval == NULL); - } - - /* ------------------------------------------------------------ - Find the row permutation for A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - if ( Fact != SamePattern_SameRowPerm ) { - if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else { /* options->RowPerm == LargeDiag */ - /* Get a new perm_r[] */ - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = doubleMalloc_dist(m)) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = doubleMalloc_dist(n)) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation */ - dldperm(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; -#if ( PRNTlevel>=2 ) - if ( perm_r[irow] == icol ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if ( job == 4 ) - dsum += fabs(a[i]); - else if ( job == 5 ) - dprod *= fabs(a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* end if options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); -#endif - } /* end if Fact ... */ - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - -#if ( DEBUGlevel>=2 ) - if ( !iam ) PrintInt10("perm_r", m, perm_r); -#endif - } /* end if (!factored) */ - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = pdlangs(norm, A, grid); -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. anorm %e\n", anorm); -#endif - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - parSymbFact = options->ParSymbFact; - -#if ( PRNTlevel>=1 ) - if ( parSymbFact && permc_spec != PARMETIS ) - if ( !iam ) printf(".. Parallel symbolic factorization" - " only works wth ParMetis!\n"); -#endif - - if ( parSymbFact == YES || permc_spec == PARMETIS ) { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); - - /* create a new communicator for the first noDomains processors in - grid->comm */ - key = iam; - if (iam < noDomains) col = 0; - else col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm ); - - permc_spec = PARMETIS; /* only works with PARMETIS */ - } - - if ( permc_spec != MY_PERMC && Fact == DOFACT ) { - if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT("ERROR in get perm_c parmetis."); - } else { - get_perm_c_dist(iam, permc_spec, &GA, perm_c); - } - } - - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if ( Fact != SamePattern_SameRowPerm ) { - if ( parSymbFact == NO ) { - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if ( iinfo < 0 ) { /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr,"symbfact() error returns %d\n",iinfo); - exit(-1); - } - } - } /* end if serial symbolic factorization */ - else { /* parallel symbolic factorization */ - t = SuperLU_timer_(); - flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (flinfo > 0) - ABORT("Insufficient memory for parallel symbolic factorization."); - } - } /* end if Fact ... */ - -#if ( PRNTlevel>=1 ) - if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); -#endif - if (sizes) SUPERLU_FREE (sizes); - if (fstVtxSep) SUPERLU_FREE (fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free (&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_(); - dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factorization. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - } else { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_(); - dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_() - t; - } - -#if ( PRNTlevel>=1 ) - if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); -#endif - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - if (parSymbFact == TRUE) - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical - factorization */ - temp = SUPERLU_MAX(symb_mem_usage.total, - (float)dist_mem_use); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - - /* Destroy GA */ - if ( Fact != SamePattern_SameRowPerm ) - Destroy_CompCol_Matrix_dist(&GA); - } /* end if (!factored) */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doubleMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= C[irow]; - ++irow; - } - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - if ( options->SolveInitialized == NO ) { - dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, - SOLVEstruct); - } - - pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - - t = SuperLU_timer_(); - if ( options->RefineInitialized == NO || Fact == DOFACT ) { - /* All these cases need to re-initialize gsmv structure */ - if ( options->RefineInitialized ) - pdgsmv_finalize(SOLVEstruct->gsmv_comm); - pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); - - /* Save a copy of the transformed local col indices - in colind_gsmv[]. */ - if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); - if ( !(it = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for colind_gsmv[]"); - colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; - options->RefineInitialized = YES; - } else if ( Fact == SamePattern || - Fact == SamePattern_SameRowPerm ) { - double at; - int_t k, jcol, p; - /* Swap to beginning the part of A corresponding to the - local part of X, as was done in pdgsmv_init() */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = rowptr[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - p = SOLVEstruct->row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - at = a[k]; a[k] = a[j]; a[j] = at; - ++k; - } - } - } - - /* Re-use the local col indices of A obtained from the - previous call to pdgsmv_init() */ - for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; - } - - if ( nrhs == 1 ) { /* Use the existing solve structure */ - SOLVEstruct1 = SOLVEstruct; - } else { /* For nrhs > 1, since refinement is performed for RHS - one at a time, the communication structure for pdgstrs - is different than the solve with nrhs RHS. - So we use SOLVEstruct1 for the refinement step. - */ - if ( !(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) - ABORT("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. */ - if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if ( nrhs > 1 ) { - pxgstrs_finalize(SOLVEstruct1->gstrs_comm); - SUPERLU_FREE(SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix B <= Pc'*X. */ - pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if ( DEBUGlevel>=2 ) - printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); -#endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= C[irow]; - ++irow; - } - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it was not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) - Destroy_CompCol_Permuted_dist(&GAC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgssvx()"); -#endif - -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,1280 +0,0 @@ - - -#include -#include "superlu_ddefs.h" - -void -pdgssvx(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - double B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * - * Last update: 2/18/2008 - * - * - * Purpose - * ======= - * - * PDGSSVX solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * The input matrices A and B are distributed by block rows. - * Here is a graphical illustration (0-based indexing): - * - * A B - * 0 --------------- ------ - * | | | | - * | | P0 | | - * | | | | - * --------------- ------ - * - fst_row->| | | | - * | | | | | - * m_loc | | P1 | | - * | | | | | - * - | | | | - * --------------- ------ - * | . | |. | - * | . | |. | - * | . | |. | - * --------------- ------ - * - * where, fst_row is the row number of the first row, - * m_loc is the number of rows local to this processor - * These are defined in the 'SuperMatrix' structure, see supermatrix.h. - * - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right-hand sides, distributed by block rows, - * and its dimensions ldb (local) and nrhs (global) - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has four options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * o A, the input matrix - * - * as well as the following options to determine what matrix to - * factorize. - * - * o options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * o options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * o options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * o options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * o ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * . ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * . ScalePermstruct->R, array of row scale factors - * . ScalePermstruct->C, array of column scale factors - * . ScalePermstruct->perm_r, row permutation vector - * . ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix diag(R)*A*diag(C)*Pc^T, where - * Pc is the row permutation matrix determined by - * ScalePermstruct->perm_c - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * o LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In - * this case the algorithm saves time by reusing the previously - * computed column permutation vector stored in - * ScalePermstruct->perm_c and the "elimination tree" of A - * stored in LUstruct->etree - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->RowPerm - * o options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->perm_c, the column permutation - * o LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * o LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are - * ignored. This is because the permutations from ScalePermstruct->perm_r - * and ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->DiagScale, how the previous matrix was row - * and/or column scaled - * o ScalePermstruct->R, the row scalings of the previous matrix, - * if any - * o ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * o ScalePermstruct->perm_r, the row permutation of the previous - * matrix - * o ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * o all of LUstruct, the previously computed information about - * L and U (the actual numerical values of L and U - * stored in LUstruct->Llu are ignored) - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated (thus ScalePermstruct->DiagScale, - * R and C may be modified) - * o LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * o A, the unfactored matrix, only in the case that iterative - * refinment is to be done (specifically A must be the output - * A from the previous call, so that it has been scaled and permuted) - * o all of ScalePermstruct - * o all of LUstruct, including the actual numerical values of - * L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* (global) - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* (local) - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * That is, A is stored in distributed compressed row format. - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, - * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * diag(R)*A*diag(C)*Pc^T. - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * ScalePermstruct (input/output) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) double* (local) - * On entry, the right-hand side matrix of dimension (m_loc, nrhs), - * where, m_loc is the number of rows stored locally on my - * process and is defined in the data structure of matrix A. - * On exit, the solution matrix if info = 0; - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* (global) - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) (global) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) (global) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) (local) - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * SOLVEstruct (input/output) SOLVEstruct_t* - * The data structure to hold the communication pattern used - * in the phases of triangular solution and iterative refinement. - * This pattern should be intialized only once for repeated solutions. - * If options->SolveInitialized = YES, it is an input argument. - * If options->SolveInitialized = NO and nrhs != 0, it is an output - * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. - * - * berr (output) double*, dimension (nrhs) (global) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * See superlu_ddefs.h for the definitions of varioous data types. - * - */ - NRformat_loc *Astore; - SuperMatrix GA; /* Global A in NC format */ - NCformat *GAstore; - double *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by PDDISTRIBUTE - routine. They will be freed after PDDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - double *a; - int_t *colptr, *rowind; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *rowptr, *colind; /* Local A in NR*/ - int_t *rowind_loc, *colptr_loc; - int_t colequ, Equil, factored, job, notran, rowequ, need_value; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int_t nnz_loc, m_loc, fst_row, icol; - int iam; - int ldx; /* LDA for matrix X (local). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - double *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - int_t procs; - - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep, parSymbFact; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - /* Initialization. */ - m = A->nrow; - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (double *) Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - /* Test the input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < m_loc ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pdgssvx", grid, -*info); - return; - } - - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. */ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgssvx()"); -#endif - - /* Not factored & ask for equilibration */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j+1]; ++i){ - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } - } else { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - /* Equilibrate matrix A if it is badly-scaled. */ - pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* if Equil ... */ - - if ( !factored ) { /* Skip this if already factored. */ - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if ( Fact != SamePattern_SameRowPerm ) { - need_value = (options->RowPerm == LargeDiag); - pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *) GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - if ( need_value ) a_GA = (double *) GAstore->nzval; - else assert(GAstore->nzval == NULL); - } - - /* ------------------------------------------------------------ - Find the row permutation for A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - if ( Fact != SamePattern_SameRowPerm ) { - if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else { /* options->RowPerm == LargeDiag */ - /* Get a new perm_r[] */ - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = doubleMalloc_dist(m)) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = doubleMalloc_dist(n)) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation */ - dldperm(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; -#if ( PRNTlevel>=2 ) - if ( perm_r[irow] == icol ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if ( job == 4 ) - dsum += fabs(a[i]); - else if ( job == 5 ) - dprod *= fabs(a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* end if options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); -#endif - } /* end if Fact ... */ - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - -#if ( DEBUGlevel>=2 ) - if ( !iam ) PrintInt10("perm_r", m, perm_r); -#endif - } /* end if (!factored) */ - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = pdlangs(norm, A, grid); -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. anorm %e\n", anorm); -#endif - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - parSymbFact = options->ParSymbFact; - -#if ( PRNTlevel>=1 ) - if ( parSymbFact && permc_spec != PARMETIS ) - if ( !iam ) printf(".. Parallel symbolic factorization" - " only works wth ParMetis!\n"); -#endif - - if ( parSymbFact == YES || permc_spec == PARMETIS ) { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) ( pow(2, ((int) log2( (double)nprocs_num )))); - - /* create a new communicator for the first noDomains processors in - grid->comm */ - key = iam; - if (iam < noDomains) col = 0; - else col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm ); - - permc_spec = PARMETIS; /* only works with PARMETIS */ - } - - if ( permc_spec != MY_PERMC && Fact == DOFACT ) { - if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT("ERROR in get perm_c parmetis."); - } else { - get_perm_c_dist(iam, permc_spec, &GA, perm_c); - } - } - - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if ( Fact != SamePattern_SameRowPerm ) { - if ( parSymbFact == NO ) { - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if ( iinfo < 0 ) { /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr,"symbfact() error returns %d\n",iinfo); - exit(-1); - } - } - } /* end if serial symbolic factorization */ - else { /* parallel symbolic factorization */ - t = SuperLU_timer_(); - flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (flinfo > 0) - ABORT("Insufficient memory for parallel symbolic factorization."); - } - } /* end if Fact ... */ - - if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); - if (sizes) SUPERLU_FREE (sizes); - if (fstVtxSep) SUPERLU_FREE (fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free (&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_(); - dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factorization. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - } else { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_(); - dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_() - t; - } - - if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - if (parSymbFact == TRUE) - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical - factorization */ - temp = SUPERLU_MAX(symb_mem_usage.total, - (float)dist_mem_use); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - - /* Destroy GA */ - if ( Fact != SamePattern_SameRowPerm ) - Destroy_CompCol_Matrix_dist(&GA); - } /* end if (!factored) */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doubleMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= C[irow]; - ++irow; - } - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - if ( options->SolveInitialized == NO ) { - dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, - SOLVEstruct); - } - - pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - - t = SuperLU_timer_(); - if ( options->RefineInitialized == NO || Fact == DOFACT ) { - /* All these cases need to re-initialize gsmv structure */ - if ( options->RefineInitialized ) - pdgsmv_finalize(SOLVEstruct->gsmv_comm); - pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); - - /* Save a copy of the transformed local col indices - in colind_gsmv[]. */ - if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); - if ( !(it = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for colind_gsmv[]"); - colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; - options->RefineInitialized = YES; - } else if ( Fact == SamePattern || - Fact == SamePattern_SameRowPerm ) { - double at; - int_t k, jcol, p; - /* Swap to beginning the part of A corresponding to the - local part of X, as was done in pdgsmv_init() */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = rowptr[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - p = SOLVEstruct->row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - at = a[k]; a[k] = a[j]; a[j] = at; - ++k; - } - } - } - - /* Re-use the local col indices of A obtained from the - previous call to pdgsmv_init() */ - for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; - } - - if ( nrhs == 1 ) { /* Use the existing solve structure */ - SOLVEstruct1 = SOLVEstruct; - } else { /* For nrhs > 1, since refinement is performed for RHS - one at a time, the communication structure for pdgstrs - is different than the solve with nrhs RHS. - So we use SOLVEstruct1 for the refinement step. - */ - if ( !(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) - ABORT("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. */ - if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if ( nrhs > 1 ) { - pxgstrs_finalize(SOLVEstruct1->gstrs_comm); - SUPERLU_FREE(SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix B <= Pc'*X. */ - pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if ( DEBUGlevel>=2 ) - printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); -#endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= C[irow]; - ++irow; - } - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it was not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) - Destroy_CompCol_Permuted_dist(&GAC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgssvx()"); -#endif - -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1343 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - */ - -#include -#include "superlu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -#if ( VAMPIR>=1 ) -#include -#endif - -/* - * Internal prototypes - */ -static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *); -#ifdef _CRAY -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif - -/* - * Sketch of the algorithm - * ======================= - * - * The following relations hold: - * * A_kk = L_kk * U_kk - * * L_ik = Aik * U_kk^(-1) - * * U_kj = L_kk^(-1) * A_kj - * - * ---------------------------------- - * | | | - * ----|----------------------------- - * | | \ U_kk| | - * | | \ | U_kj | - * | |L_kk \ | || | - * ----|-------|---------||---------- - * | | | \/ | - * | | | | - * | | | | - * | | | | - * | | L_ik ==> A_ij | - * | | | | - * | | | | - * | | | | - * ---------------------------------- - * - * Handle the first block of columns separately. - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. ( pdgstrf2(0), one column at a time ) - * * Compute block row of U - * * Update trailing matrix - * - * Loop over the remaining blocks of columns. - * mycol = MYCOL( iam, grid ); - * myrow = MYROW( iam, grid ); - * N = nsupers; - * For (k = 1; k < N; ++k) { - * krow = PROW( k, grid ); - * kcol = PCOL( k, grid ); - * Pkk = PNUM( krow, kcol, grid ); - * - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. - * if ( mycol == kcol ) { - * pdgstrf2(k), one column at a time - * } - * - * * Parallel triangular solve - * if ( iam == Pkk ) multicast L_k,k to this process row; - * if ( myrow == krow && mycol != kcol ) { - * Recv L_k,k from process Pkk; - * for (j = k+1; j < N; ++j) - * if ( PCOL( j, grid ) == mycol && A_k,j != 0 ) - * U_k,j = L_k,k \ A_k,j; - * } - * - * * Parallel rank-k update - * if ( myrow == krow ) multicast U_k,k+1:N to this process column; - * if ( mycol == kcol ) multicast L_k+1:N,k to this process row; - * if ( myrow != krow ) { - * Pkj = PNUM( krow, mycol, grid ); - * Recv U_k,k+1:N from process Pkj; - * } - * if ( mycol != kcol ) { - * Pik = PNUM( myrow, kcol, grid ); - * Recv L_k+1:N,k from process Pik; - * } - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L_i,k != 0 && U_k,j != 0 ) - * A_i,j = A_i,j - L_i,k * U_k,j; - * } - * } - * - * - * Remaining issues - * (1) Use local indices for L subscripts and SPA. [DONE] - * - */ -/************************************************************************/ -int_t pdgstrf -/************************************************************************/ -( - superlu_options_t *options, int m, int n, double anorm, - LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info - ) -/* - * Purpose - * ======= - * - * PDGSTRF performs the LU factorization in parallel. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following field should be defined: - * o ReplaceTinyPivot (yes_no_t) - * Specifies whether to replace the tiny diagonals by - * sqrt(epsilon)*norm(A) during LU factorization. - * - * m (input) int - * Number of rows in the matrix. - * - * n (input) int - * Number of columns in the matrix. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * The following fields should be defined: - * - * o Glu_persist (input) Glu_persist_t* - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (input/output) LocalLU_t* - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ -#ifdef _CRAY - _fcd ftcs = _cptofcd("N", strlen("N")); - _fcd ftcs1 = _cptofcd("L", strlen("L")); - _fcd ftcs2 = _cptofcd("N", strlen("N")); - _fcd ftcs3 = _cptofcd("U", strlen("U")); -#endif - double alpha = 1.0, beta = 0.0; - int_t *xsup; - int_t *lsub, *lsub1, *usub, *Usub_buf, - *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - double *lusup, *lusup1, *uval, *Uval_buf, - *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, - lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, - nlb, nub, nsupc, rel, rukp; - int_t Pc, Pr; - int iam, kcol, krow, mycol, myrow, pi, pj; - int j, k, lk, nsupers; - int nsupr, nbrow, segsize; - int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: - * 0 : transferred in Lsub_buf[] - * 1 : transferred in Lval_buf[] - * 2 : transferred in Usub_buf[] - * 3 : transferred in Uval_buf[] - */ - int_t msg0, msg2; - int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; - double **Unzval_br_ptr, **Lnzval_bc_ptr; - int_t *index; - double *nzval; - int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ - double *ucol; - int_t *indirect; - double *tempv, *tempv2d; - int_t iinfo; - int_t *ToRecv, *ToSendD, **ToSendR; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - superlu_scope_t *scp; - float s_eps; - double thresh; - double *tempU2d, *tempu; - int full, ldt, ldu, lead_zero, ncols; - MPI_Request recv_req[4], *send_req; - MPI_Status status; -#if ( DEBUGlevel>=2 ) - int_t num_copy=0, num_update=0; -#endif -#if ( PRNTlevel==3 ) - int_t zero_msg = 0, total_msg = 0; -#endif -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; - int_t iword = sizeof(int_t), dword = sizeof(double); -#endif - - /* Test the input parameters. */ - *info = 0; - if ( m < 0 ) *info = -2; - else if ( n < 0 ) *info = -3; - if ( *info ) { - pxerbla("pdgstrf", grid, -*info); - return (-1); - } - - /* Quick return if possible. */ - if ( m == 0 || n == 0 ) return 0; - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - s_eps = slamch_("Epsilon"); - thresh = s_eps * anorm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrf()"); -#endif - - stat->ops[FACT] = 0.0; - - if ( Pr*Pc > 1 ) { - i = Llu->bufmax[0]; - if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lsub_buf."); - Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; - i = Llu->bufmax[1]; - if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lval_buf[]."); - Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; - if ( Llu->bufmax[2] != 0 ) - if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) - ABORT("Malloc fails for Usub_buf[]."); - if ( Llu->bufmax[3] != 0 ) - if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) - ABORT("Malloc fails for Uval_buf[]."); - if ( !(send_req = - (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) - ABORT("Malloc fails for send_req[]."); - } - if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) - ABORT("Malloc fails for ujrow[]."); - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); - printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", - Llu->bufmax[0], Llu->bufmax[1], - Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); - } -#endif - - Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; - Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; - Lval_buf_2[0] = Llu->Lval_buf_2[0]; - Lval_buf_2[1] = Llu->Lval_buf_2[1]; - Usub_buf = Llu->Usub_buf; - Uval_buf = Llu->Uval_buf; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; - ToRecv = Llu->ToRecv; - ToSendD = Llu->ToSendD; - ToSendR = Llu->ToSendR; - - ldt = sp_ienv_dist(3); /* Size of maximum supernode */ - if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) - ABORT("Calloc fails for tempv2d[]."); - tempU2d = tempv2d + ldt*ldt; - if ( !(indirect = intMalloc_dist(ldt)) ) - ABORT("Malloc fails for indirect[]."); - k = CEILING( nsupers, Pr ); /* Number of local block rows */ - if ( !(iuip = intMalloc_dist(k)) ) - ABORT("Malloc fails for iuip[]."); - if ( !(ruip = intMalloc_dist(k)) ) - ABORT("Malloc fails for ruip[]."); - -#if ( VAMPIR>=1 ) - VT_symdef(1, "Send-L", "Comm"); - VT_symdef(2, "Recv-L", "Comm"); - VT_symdef(3, "Send-U", "Comm"); - VT_symdef(4, "Recv-U", "Comm"); - VT_symdef(5, "TRF2", "Factor"); - VT_symdef(100, "Factor", "Factor"); - VT_begin(100); - VT_traceon(); -#endif - - /* --------------------------------------------------------------- - Handle the first block column separately to start the pipeline. - --------------------------------------------------------------- */ - if ( mycol == 0 ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - scp = &grid->rscp; /* The scope of process row. */ - - /* Process column *kcol* multicasts numeric values of L(:,k) - to process rows. */ - lsub = Lrowind_bc_ptr[0]; - lusup = Lnzval_bc_ptr[0]; - if ( lsub ) { - msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub[1] * SuperSize( 0 ); - } else { - msgcnt[0] = msgcnt[1] = 0; - } - - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[0][pj] != EMPTY ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, - &send_req[pj] ); - MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, - &send_req[pj+Pc] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, 0, msgcnt[0], msgcnt[1], pj); -#endif -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif - } - } /* for pj ... */ - } else { /* Post immediate receives. */ - if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, - 0, scp->comm, &recv_req[0] ); - MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, - 1, scp->comm, &recv_req[1] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); -#endif - } - } /* if mycol == 0 */ - - /* ------------------------------------------ - MAIN LOOP: Loop through all block columns. - ------------------------------------------ */ - for (k = 0; k < nsupers; ++k) { - - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - - if ( mycol == kcol ) { - lk = LBj( k, grid ); /* Local block number. */ - - for (pj = 0; pj < Pc; ++pj) { - /* Wait for Isend to complete before using lsub/lusup. */ - if ( ToSendR[lk][pj] != EMPTY ) { - MPI_Wait( &send_req[pj], &status ); - MPI_Wait( &send_req[pj+Pc], &status ); - } - } - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - } else { - if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ - scp = &grid->rscp; /* The scope of process row. */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(2); -#endif - /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[0]);*/ - /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, - (4*k)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[0], &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); - /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[1]);*/ - /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*k+1)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[1], &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); -#if ( VAMPIR>=1 ) - VT_end(2); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", - iam, k, msgcnt[0], msgcnt[1], kcol); - fflush(stdout); -#endif - lsub = Lsub_buf_2[k%2]; - lusup = Lval_buf_2[k%2]; -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[0] ) ++zero_msg; -#endif - } else msgcnt[0] = 0; - } /* if mycol = Pc(k) */ - - scp = &grid->cscp; /* The scope of process column. */ - - if ( myrow == krow ) { - /* Parallel triangular solve across process row *krow* -- - U(k,j) = L(k,k) \ A(k,j). */ -#ifdef _CRAY - pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); -#else - pdgstrs2(n, k, Glu_persist, grid, Llu, stat); -#endif - - /* Multicasts U(k,:) to process columns. */ - lk = LBi( k, grid ); - usub = Ufstnz_br_ptr[lk]; - uval = Unzval_br_ptr[lk]; - if ( usub ) { - msgcnt[2] = usub[2]; - msgcnt[3] = usub[1]; - } else { - msgcnt[2] = msgcnt[3] = 0; - } - - if ( ToSendD[lk] == YES ) { - for (pi = 0; pi < Pr; ++pi) { - if ( pi != myrow ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(3); -#endif - MPI_Send( usub, msgcnt[2], mpi_int_t, pi, - (4*k+2)%NTAGS, scp->comm); - MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, - (4*k+3)%NTAGS, scp->comm); -#if ( VAMPIR>=1 ) - VT_end(3); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); -#endif - } /* if pi ... */ - } /* for pi ... */ - } /* if ToSendD ... */ - } else { /* myrow != krow */ - if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(4); -#endif - /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[2]);*/ - MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, - (4*k+2)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); - /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[3]);*/ - MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, - (4*k+3)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); -#if ( VAMPIR>=1 ) - VT_end(4); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif - usub = Usub_buf; - uval = Uval_buf; -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); -#endif -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[2] ) ++zero_msg; -#endif - } else msgcnt[2] = 0; - } /* if myrow == Pr(k) */ - - /* - * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L(i,k) != 0 && U(k,j) != 0 ) - * A(i,j) = A(i,j) - L(i,k) * U(k,j); - */ - msg0 = msgcnt[0]; - msg2 = msgcnt[2]; - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - nsupr = lsub[1]; /* LDA of lusup. */ - if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ - lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; - luptr0 = knsupc; - nlb = lsub[0] - 1; - } else { - lptr0 = BC_HEADER; - luptr0 = 0; - nlb = lsub[0]; - } - lptr = lptr0; - for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ - ib = lsub[lptr]; - lib = LBi( ib, grid ); - iuip[lib] = BR_HEADER; - ruip[lib] = 0; - lptr += LB_DESCRIPTOR + lsub[lptr+1]; - } - nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - rukp = 0; /* Pointer to nzval[] of U(k,:) */ - klst = FstBlockC( k+1 ); - - /* --------------------------------------------------- - Update the first block column A(:,k+1). - --------------------------------------------------- */ - jb = usub[iukp]; /* Global block number of block U(k,j). */ - if ( jb == k+1 ) { /* First update (k+1)-th block. */ - --nub; - lptr = lptr0; - luptr = luptr0; - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#elif defined (USE_VENDOR_BLAS) - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, - &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt, 1, 1); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0, it = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[it++]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (it = 0, i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[it++]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* if jb == k+1 */ - } /* if L(:,k) and U(k,:) not empty */ - - - if ( k+1 < nsupers ) { - kcol = PCOL( k+1, grid ); - if ( mycol == kcol ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - /* Factor diagonal and subdiagonal blocks and test for exact - singularity. */ - pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - /* Process column *kcol+1* multicasts numeric values of L(:,k+1) - to process rows. */ - lk = LBj( k+1, grid ); /* Local block number. */ - lsub1 = Lrowind_bc_ptr[lk]; - if ( lsub1 ) { - msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub1[1] * SuperSize( k+1 ); - } else { - msgcnt[0] = 0; - msgcnt[1] = 0; - } - scp = &grid->rscp; /* The scope of process row. */ - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[lk][pj] != EMPTY ) { - lusup1 = Lnzval_bc_ptr[lk]; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, - (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); - MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, - (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, k+1, msgcnt[0], msgcnt[1], pj); -#endif - } - } /* for pj ... */ - } else { /* Post Recv of block column L(:,k+1). */ - if ( ToRecv[k+1] >= 1 ) { - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, - (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); - MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); -#endif - } - } /* if mycol == Pc(k+1) */ - } /* if k+1 < nsupers */ - - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - /* --------------------------------------------------- - Update all other blocks using block row U(k,:) - --------------------------------------------------- */ - for (j = 0; j < nub; ++j) { - lptr = lptr0; - luptr = luptr0; - jb = usub[iukp]; /* Global block number of block U(k,j). */ - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#elif defined (USE_VENDOR_BLAS) - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, - &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt, 1, 1); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - /* Skip descriptor. Now point to fstnz index of - block U(i,j). */ - iuip[lib] += UB_DESCRIPTOR; - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0 ; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[i]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted for the L blocks. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[i]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* for j ... */ - } /* if k L(:,k) and U(k,:) are not empty */ - - } - /* ------------------------------------------ - END MAIN LOOP: for k = ... - ------------------------------------------ */ - -#if ( VAMPIR>=1 ) - VT_end(100); - VT_traceoff(); -#endif - - if ( Pr*Pc > 1 ) { - SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ - SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ - if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); - if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); - SUPERLU_FREE(send_req); - } - - SUPERLU_FREE(Llu->ujrow); - SUPERLU_FREE(tempv2d); - SUPERLU_FREE(indirect); - SUPERLU_FREE(iuip); - SUPERLU_FREE(ruip); - - /* Prepare error message. */ - if ( *info == 0 ) *info = n + 1; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - { - float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; - - MPI_Reduce( &msg_cnt, &msg_cnt_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_cnt, &msg_cnt_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - if ( !iam ) { - printf("\tPDGSTRF comm stat:" - "\tAvg\tMax\t\tAvg\tMax\n" - "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", - msg_cnt_sum/Pr/Pc, msg_cnt_max, - msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); - } - } -#endif - if ( iinfo == n + 1 ) *info = 0; - else *info = iinfo; - - -#if ( PRNTlevel==3 ) - MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); - MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # total msg\t%d\n", iinfo); -#endif - -#if ( DEBUGlevel>=2 ) - for (i = 0; i < Pr * Pc; ++i) { - if ( iam == i ) { - dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); - dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); - printf("(%d)\n", iam); - PrintInt10("Recv", nsupers, Llu->ToRecv); - } - MPI_Barrier( grid->comm ); - } -#endif - -#if ( DEBUGlevel>=3 ) - printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrf()"); -#endif -} /* PDGSTRF */ - - -/************************************************************************/ -static void pdgstrf2 -/************************************************************************/ -( - superlu_options_t *options, - int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, int* info - ) -/* - * Purpose - * ======= - * Factor diagonal and subdiagonal blocks and test for exact singularity. - * Only the process column that owns block column *k* participates - * in the work. - * - * Arguments - * ========= - * - * k (input) int (global) - * The column number of the block column to be factorized. - * - * thresh (input) double (global) - * The threshold value = s_eps * anorm. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization. - * See SuperLUStat_t structure defined in util.h. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ - int c, iam, l, pkk; - int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int luptr; - int_t i, krow, j, jfst, jlst; - int_t nsupc; /* number of columns in the block */ - int_t *xsup = Glu_persist->xsup; - double *lusup, temp; - double *ujrow; - double alpha = -1; - *info = 0; - - /* Quick return. */ - - /* Initialization. */ - iam = grid->iam; - krow = PROW( k, grid ); - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - j = LBj( k, grid ); /* Local block number */ - jfst = FstBlockC( k ); - jlst = FstBlockC( k+1 ); - lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize( k ); - if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; - ujrow = Llu->ujrow; - - luptr = 0; /* Point to the diagonal entries. */ - c = nsupc; - for (j = 0; j < jlst - jfst; ++j) { - /* Broadcast the j-th row (nsupc - j) elements to - the process column. */ - if ( iam == pkk ) { /* Diagonal process. */ - i = luptr; - if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { - if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ -#if ( PRNTlevel>=2 ) - printf("(%d) .. col %d, tiny pivot %e ", - iam, jfst+j, lusup[i]); -#endif - /* Keep the replaced diagonal with the same sign. */ - if ( lusup[i] < 0 ) lusup[i] = -thresh; - else lusup[i] = thresh; -#if ( PRNTlevel>=2 ) - printf("replaced by %e\n", lusup[i]); -#endif - ++(stat->TinyPivots); - } - } - for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; - } -#if 0 - dbcast_col(ujrow, c, pkk, UjROW, grid, &c); -#else - MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); - /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, - grid, COMM_COLUMN, &c);*/ -#endif - -#if ( DEBUGlevel>=2 ) -if ( k == 3329 && j == 2 ) { - if ( iam == pkk ) { - printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); - } else { - printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); - } -} -#endif - - if ( !lusup ) { /* Empty block column. */ - --c; - if ( ujrow[0] == 0.0 ) *info = j+jfst+1; - continue; - } - - /* Test for singularity. */ - if ( ujrow[0] == 0.0 ) { - *info = j+jfst+1; - } else { - /* Scale the j-th column of the matrix. */ - temp = 1.0 / ujrow[0]; - if ( iam == pkk ) { - for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr-j-1; - } else { - for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr; - } - } - - /* Rank-1 update of the trailing submatrix. */ - if ( --c ) { - if ( iam == pkk ) { - l = nsupr - j - 1; -#ifdef _CRAY - SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#elif defined (USE_VENDOR_BLAS) - dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#else - hypre_F90_NAME_BLAS(dger,DGER)(&l, &c, &alpha, - &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#endif - stat->ops[FACT] += 2 * l * c; - } else { -#ifdef _CRAY - SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#elif defined (USE_VENDOR_BLAS) - dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#else - hypre_F90_NAME_BLAS(dger,DGER)(&nsupr, &c, &alpha, - &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#endif - stat->ops[FACT] += 2 * nsupr * c; - } - } - - /* Move to the next column. */ - if ( iam == pkk ) luptr += nsupr + 1; - else luptr += nsupr; - - } /* for j ... */ - -} /* PDGSTRF2 */ - - -/************************************************************************/ -static void pdgstrs2 -/************************************************************************/ -#ifdef _CRAY -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 - ) -#else -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat - ) -#endif -/* - * Purpose - * ======= - * Perform parallel triangular solves - * U(k,:) := A(k,:) \ L(k,k). - * Only the process row that owns block row *k* participates - * in the work. - * - * Arguments - * ========= - * - * m (input) int (global) - * Number of rows in the matrix. - * - * k (input) int (global) - * The row number of the block row to be factorized. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization; - * See SuperLUStat_t structure defined in util.h. - * - */ -{ - int iam, pkk; - int incx = 1; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ - int segsize; - int_t nsupc; /* number of columns in the block */ - int_t luptr, iukp, rukp; - int_t b, gb, j, klst, knsupc, lk, nb; - int_t *xsup = Glu_persist->xsup; - int_t *usub; - double *lusup, *uval; - - /* Quick return. */ - lk = LBi( k, grid ); /* Local block number */ - if ( !Llu->Unzval_br_ptr[lk] ) return; - - /* Initialization. */ - iam = grid->iam; - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - klst = FstBlockC( k+1 ); - knsupc = SuperSize( k ); - usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ - uval = Llu->Unzval_br_ptr[lk]; - nb = usub[0]; - iukp = BR_HEADER; - rukp = 0; - if ( iam == pkk ) { - lk = LBj( k, grid ); - nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ - lusup = Llu->Lnzval_bc_ptr[lk]; - } else { - nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ - lusup = Llu->Lval_buf_2[k%2]; - } - - /* Loop through all the row blocks. */ - for (b = 0; b < nb; ++b) { - gb = usub[iukp]; - nsupc = SuperSize( gb ); - iukp += UB_DESCRIPTOR; - - /* Loop through all the segments in the block. */ - for (j = 0; j < nsupc; ++j) { - segsize = klst - usub[iukp++]; - if ( segsize ) { /* Nonzero segment. */ - luptr = (knsupc - segsize) * (nsupr + 1); -#ifdef _CRAY - STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#elif defined (USE_VENDOR_BLAS) - dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L", "N", "U", &segsize, - &lusup[luptr], &nsupr, - &uval[rukp], &incx, 1, 1, 1); -#endif - stat->ops[FACT] += segsize * (segsize + 1); - rukp += segsize; - } - } - } /* for b ... */ - -} /* PDGSTRS2 */ - -static int -probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm, - int buf_size) -{ - MPI_Status status; - int count; - - MPI_Probe( source, tag, comm, &status ); - MPI_Get_count( &status, datatype, &count ); - if ( count > buf_size ) { - printf("(%d) Recv'ed count %d > buffer size $d\n", - iam, count, buf_size); - exit(-1); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1324 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - */ - -#include -#include "superlu_ddefs.h" -#define CRAY_X1 -#if ( VAMPIR>=1 ) -#include -#endif - -/* - * Internal prototypes - */ -static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *); -#ifdef _CRAY -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif - -/* - * Sketch of the algorithm - * ======================= - * - * The following relations hold: - * * A_kk = L_kk * U_kk - * * L_ik = Aik * U_kk^(-1) - * * U_kj = L_kk^(-1) * A_kj - * - * ---------------------------------- - * | | | - * ----|----------------------------- - * | | \ U_kk| | - * | | \ | U_kj | - * | |L_kk \ | || | - * ----|-------|---------||---------- - * | | | \/ | - * | | | | - * | | | | - * | | | | - * | | L_ik ==> A_ij | - * | | | | - * | | | | - * | | | | - * ---------------------------------- - * - * Handle the first block of columns separately. - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. ( pdgstrf2(0), one column at a time ) - * * Compute block row of U - * * Update trailing matrix - * - * Loop over the remaining blocks of columns. - * mycol = MYCOL( iam, grid ); - * myrow = MYROW( iam, grid ); - * N = nsupers; - * For (k = 1; k < N; ++k) { - * krow = PROW( k, grid ); - * kcol = PCOL( k, grid ); - * Pkk = PNUM( krow, kcol, grid ); - * - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. - * if ( mycol == kcol ) { - * pdgstrf2(k), one column at a time - * } - * - * * Parallel triangular solve - * if ( iam == Pkk ) multicast L_k,k to this process row; - * if ( myrow == krow && mycol != kcol ) { - * Recv L_k,k from process Pkk; - * for (j = k+1; j < N; ++j) - * if ( PCOL( j, grid ) == mycol && A_k,j != 0 ) - * U_k,j = L_k,k \ A_k,j; - * } - * - * * Parallel rank-k update - * if ( myrow == krow ) multicast U_k,k+1:N to this process column; - * if ( mycol == kcol ) multicast L_k+1:N,k to this process row; - * if ( myrow != krow ) { - * Pkj = PNUM( krow, mycol, grid ); - * Recv U_k,k+1:N from process Pkj; - * } - * if ( mycol != kcol ) { - * Pik = PNUM( myrow, kcol, grid ); - * Recv L_k+1:N,k from process Pik; - * } - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L_i,k != 0 && U_k,j != 0 ) - * A_i,j = A_i,j - L_i,k * U_k,j; - * } - * } - * - * - * Remaining issues - * (1) Use local indices for L subscripts and SPA. [DONE] - * - */ -/************************************************************************/ -void pdgstrf -/************************************************************************/ -( - superlu_options_t *options, int m, int n, double anorm, - LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info - ) -/* - * Purpose - * ======= - * - * PDGSTRF performs the LU factorization in parallel. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following field should be defined: - * o ReplaceTinyPivot (yes_no_t) - * Specifies whether to replace the tiny diagonals by - * sqrt(epsilon)*norm(A) during LU factorization. - * - * m (input) int - * Number of rows in the matrix. - * - * n (input) int - * Number of columns in the matrix. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * The following fields should be defined: - * - * o Glu_persist (input) Glu_persist_t* - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (input/output) LocalLU_t* - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ -#ifdef _CRAY - _fcd ftcs = _cptofcd("N", strlen("N")); - _fcd ftcs1 = _cptofcd("L", strlen("L")); - _fcd ftcs2 = _cptofcd("N", strlen("N")); - _fcd ftcs3 = _cptofcd("U", strlen("U")); -#endif - double alpha = 1.0, beta = 0.0; - int_t *xsup; - int_t *lsub, *lsub1, *usub, *Usub_buf, - *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - double *lusup, *lusup1, *uval, *Uval_buf, - *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, - lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, - nlb, nub, nsupc, rel, rukp; - int_t Pc, Pr; - int iam, kcol, krow, mycol, myrow, pi, pj; - int j, k, lk, nsupers; - int nsupr, nbrow, segsize; - int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: - * 0 : transferred in Lsub_buf[] - * 1 : transferred in Lval_buf[] - * 2 : transferred in Usub_buf[] - * 3 : transferred in Uval_buf[] - */ - int_t msg0, msg2; - int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; - double **Unzval_br_ptr, **Lnzval_bc_ptr; - int_t *index; - double *nzval; - int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ - double *ucol; - int_t *indirect; - double *tempv, *tempv2d; - int_t iinfo; - int_t *ToRecv, *ToSendD, **ToSendR; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - superlu_scope_t *scp; - double s_eps, thresh; - double *tempU2d, *tempu; - int full, ldt, ldu, lead_zero, ncols; - MPI_Request recv_req[4], *send_req; - MPI_Status status; -#ifdef CRAY_X1 - int nonzero_segs; -#endif -#if ( DEBUGlevel>=2 ) - int_t num_copy=0, num_update=0; -#endif -#if ( PRNTlevel==3 ) - int_t zero_msg = 0, total_msg = 0; -#endif -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; - int_t iword = sizeof(int_t), dword = sizeof(double); -#endif - - /* Test the input parameters. */ - *info = 0; - if ( m < 0 ) *info = -2; - else if ( n < 0 ) *info = -3; - if ( *info ) { - pxerbla("pdgstrf", grid, -*info); - return; - } - - /* Quick return if possible. */ - if ( m == 0 || n == 0 ) return; - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - s_eps = slamch_("Epsilon"); - thresh = s_eps * anorm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrf()"); -#endif - - stat->ops[FACT] = 0.0; - - if ( Pr*Pc > 1 ) { - i = Llu->bufmax[0]; - if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lsub_buf."); - Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; - i = Llu->bufmax[1]; - if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lval_buf[]."); - Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; - if ( Llu->bufmax[2] != 0 ) - if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) - ABORT("Malloc fails for Usub_buf[]."); - if ( Llu->bufmax[3] != 0 ) - if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) - ABORT("Malloc fails for Uval_buf[]."); - if ( !(send_req = - (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) - ABORT("Malloc fails for send_req[]."); - } - if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) - ABORT("Malloc fails for ujrow[]."); - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); - printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", - Llu->bufmax[0], Llu->bufmax[1], - Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); - } -#endif - - Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; - Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; - Lval_buf_2[0] = Llu->Lval_buf_2[0]; - Lval_buf_2[1] = Llu->Lval_buf_2[1]; - Usub_buf = Llu->Usub_buf; - Uval_buf = Llu->Uval_buf; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; - ToRecv = Llu->ToRecv; - ToSendD = Llu->ToSendD; - ToSendR = Llu->ToSendR; - - ldt = sp_ienv_dist(3); /* Size of maximum supernode */ - if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) - ABORT("Calloc fails for tempv2d[]."); - tempU2d = tempv2d + ldt*ldt; -#ifdef CRAY_X1 - if ( !(indirect = intMalloc_dist(2*ldt)) ) - ABORT("Malloc fails for indirect[]."); -#else - if ( !(indirect = intMalloc_dist(ldt)) ) - ABORT("Malloc fails for indirect[]."); -#endif - k = CEILING( nsupers, Pr ); /* Number of local block rows */ - if ( !(iuip = intMalloc_dist(k)) ) - ABORT("Malloc fails for iuip[]."); - if ( !(ruip = intMalloc_dist(k)) ) - ABORT("Malloc fails for ruip[]."); - -#if ( VAMPIR>=1 ) - VT_symdef(1, "Send-L", "Comm"); - VT_symdef(2, "Recv-L", "Comm"); - VT_symdef(3, "Send-U", "Comm"); - VT_symdef(4, "Recv-U", "Comm"); - VT_symdef(5, "TRF2", "Factor"); - VT_symdef(100, "Factor", "Factor"); - VT_begin(100); - VT_traceon(); -#endif - - /* --------------------------------------------------------------- - Handle the first block column separately to start the pipeline. - --------------------------------------------------------------- */ - if ( mycol == 0 ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - scp = &grid->rscp; /* The scope of process row. */ - - /* Process column *kcol* multicasts numeric values of L(:,k) - to process rows. */ - lsub = Lrowind_bc_ptr[0]; - lusup = Lnzval_bc_ptr[0]; - if ( lsub ) { - msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub[1] * SuperSize( 0 ); - } else { - msgcnt[0] = msgcnt[1] = 0; - } - - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[0][pj] != EMPTY ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, - &send_req[pj] ); - MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, - &send_req[pj+Pc] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, 0, msgcnt[0], msgcnt[1], pj); -#endif -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif - } - } /* for pj ... */ - } else { /* Post immediate receives. */ - if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, - 0, scp->comm, &recv_req[0] ); - MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, - 1, scp->comm, &recv_req[1] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); -#endif - } - } /* if mycol == 0 */ - - /* ------------------------------------------ - MAIN LOOP: Loop through all block columns. - ------------------------------------------ */ - for (k = 0; k < nsupers; ++k) { - - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - - if ( mycol == kcol ) { - lk = LBj( k, grid ); /* Local block number. */ - - for (pj = 0; pj < Pc; ++pj) { - /* Wait for Isend to complete before using lsub/lusup. */ - if ( ToSendR[lk][pj] != EMPTY ) { - MPI_Wait( &send_req[pj], &status ); - MPI_Wait( &send_req[pj+Pc], &status ); - } - } - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - } else { - if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ - scp = &grid->rscp; /* The scope of process row. */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(2); -#endif - /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[0]);*/ - /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, - (4*k)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[0], &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); - /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[1]);*/ - /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*k+1)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[1], &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); -#if ( VAMPIR>=1 ) - VT_end(2); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", - iam, k, msgcnt[0], msgcnt[1], kcol); - fflush(stdout); -#endif - lsub = Lsub_buf_2[k%2]; - lusup = Lval_buf_2[k%2]; -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[0] ) ++zero_msg; -#endif - } else msgcnt[0] = 0; - } /* if mycol = Pc(k) */ - - scp = &grid->cscp; /* The scope of process column. */ - - if ( myrow == krow ) { - /* Parallel triangular solve across process row *krow* -- - U(k,j) = L(k,k) \ A(k,j). */ -#ifdef _CRAY - pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); -#else - pdgstrs2(n, k, Glu_persist, grid, Llu, stat); -#endif - - /* Multicasts U(k,:) to process columns. */ - lk = LBi( k, grid ); - usub = Ufstnz_br_ptr[lk]; - uval = Unzval_br_ptr[lk]; - if ( usub ) { - msgcnt[2] = usub[2]; - msgcnt[3] = usub[1]; - } else { - msgcnt[2] = msgcnt[3] = 0; - } - - if ( ToSendD[lk] == YES ) { - for (pi = 0; pi < Pr; ++pi) { - if ( pi != myrow ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(3); -#endif - MPI_Send( usub, msgcnt[2], mpi_int_t, pi, - (4*k+2)%NTAGS, scp->comm); - MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, - (4*k+3)%NTAGS, scp->comm); -#if ( VAMPIR>=1 ) - VT_end(3); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); -#endif - } /* if pi ... */ - } /* for pi ... */ - } /* if ToSendD ... */ - } else { /* myrow != krow */ - if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(4); -#endif - /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[2]);*/ - MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, - (4*k+2)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); - /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[3]);*/ - MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, - (4*k+3)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); -#if ( VAMPIR>=1 ) - VT_end(4); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif - usub = Usub_buf; - uval = Uval_buf; -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); -#endif -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[2] ) ++zero_msg; -#endif - } else msgcnt[2] = 0; - } /* if myrow == Pr(k) */ - - /* - * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L(i,k) != 0 && U(k,j) != 0 ) - * A(i,j) = A(i,j) - L(i,k) * U(k,j); - */ - msg0 = msgcnt[0]; - msg2 = msgcnt[2]; - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - nsupr = lsub[1]; /* LDA of lusup. */ - if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ - lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; - luptr0 = knsupc; - nlb = lsub[0] - 1; - } else { - lptr0 = BC_HEADER; - luptr0 = 0; - nlb = lsub[0]; - } - lptr = lptr0; - for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ - ib = lsub[lptr]; - lib = LBi( ib, grid ); - iuip[lib] = BR_HEADER; - ruip[lib] = 0; - lptr += LB_DESCRIPTOR + lsub[lptr+1]; - } - nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - rukp = 0; /* Pointer to nzval[] of U(k,:) */ - klst = FstBlockC( k+1 ); - - /* --------------------------------------------------- - Update the first block column A(:,k+1). - --------------------------------------------------- */ - jb = usub[iukp]; /* Global block number of block U(k,j). */ - if ( jb == k+1 ) { /* First update (k+1)-th block. */ - --nub; - lptr = lptr0; - luptr = luptr0; - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0, it = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[it++]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (it = 0, i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[it++]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* if jb == k+1 */ - } /* if L(:,k) and U(k,:) not empty */ - - - if ( k+1 < nsupers ) { - kcol = PCOL( k+1, grid ); - if ( mycol == kcol ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - /* Factor diagonal and subdiagonal blocks and test for exact - singularity. */ - pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - /* Process column *kcol+1* multicasts numeric values of L(:,k+1) - to process rows. */ - lk = LBj( k+1, grid ); /* Local block number. */ - lsub1 = Lrowind_bc_ptr[lk]; - if ( lsub1 ) { - msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub1[1] * SuperSize( k+1 ); - } else { - msgcnt[0] = 0; - msgcnt[1] = 0; - } - scp = &grid->rscp; /* The scope of process row. */ - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[lk][pj] != EMPTY ) { - lusup1 = Lnzval_bc_ptr[lk]; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, - (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); - MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, - (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, k+1, msgcnt[0], msgcnt[1], pj); -#endif - } - } /* for pj ... */ - } else { /* Post Recv of block column L(:,k+1). */ - if ( ToRecv[k+1] >= 1 ) { - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, - (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); - MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); -#endif - } - } /* if mycol == Pc(k+1) */ - } /* if k+1 < nsupers */ - - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - /* --------------------------------------------------- - Update all other blocks using block row U(k,:) - --------------------------------------------------- */ - for (j = 0; j < nub; ++j) { - lptr = lptr0; - luptr = luptr0; - jb = usub[iukp]; /* Global block number of block U(k,j). */ - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - /* Skip descriptor. Now point to fstnz index of - block U(i,j). */ - iuip[lib] += UB_DESCRIPTOR; - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0 ; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[i]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted for the L blocks. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[i]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* for j ... */ - } /* if k L(:,k) and U(k,:) are not empty */ - - } - /* ------------------------------------------ - END MAIN LOOP: for k = ... - ------------------------------------------ */ - -#if ( VAMPIR>=1 ) - VT_end(100); - VT_traceoff(); -#endif - - if ( Pr*Pc > 1 ) { - SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ - SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ - if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); - if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); - SUPERLU_FREE(send_req); - } - - SUPERLU_FREE(Llu->ujrow); - SUPERLU_FREE(tempv2d); - SUPERLU_FREE(indirect); - SUPERLU_FREE(iuip); - SUPERLU_FREE(ruip); - - /* Prepare error message. */ - if ( *info == 0 ) *info = n + 1; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - { - float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; - - MPI_Reduce( &msg_cnt, &msg_cnt_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_cnt, &msg_cnt_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - if ( !iam ) { - printf("\tPDGSTRF comm stat:" - "\tAvg\tMax\t\tAvg\tMax\n" - "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", - msg_cnt_sum/Pr/Pc, msg_cnt_max, - msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); - } - } -#endif - if ( iinfo == n + 1 ) *info = 0; - else *info = iinfo; - - -#if ( PRNTlevel==3 ) - MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); - MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # total msg\t%d\n", iinfo); -#endif - -#if ( PRNTlevel==2 ) - for (i = 0; i < Pr * Pc; ++i) { - if ( iam == i ) { - dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); - dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); - printf("(%d)\n", iam); - PrintInt10("Recv", nsupers, Llu->ToRecv); - } - MPI_Barrier( grid->comm ); - } -#endif - -#if ( DEBUGlevel>=3 ) - printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrf()"); -#endif -} /* PDGSTRF */ - - -/************************************************************************/ -static void pdgstrf2 -/************************************************************************/ -( - superlu_options_t *options, - int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, int* info - ) -/* - * Purpose - * ======= - * Factor diagonal and subdiagonal blocks and test for exact singularity. - * Only the process column that owns block column *k* participates - * in the work. - * - * Arguments - * ========= - * - * k (input) int (global) - * The column number of the block column to be factorized. - * - * thresh (input) double (global) - * The threshold value = s_eps * anorm. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization. - * See SuperLUStat_t structure defined in util.h. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ - int c, iam, l, pkk; - int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int luptr; - int_t i, krow, j, jfst, jlst; - int_t nsupc; /* number of columns in the block */ - int_t *xsup = Glu_persist->xsup; - double *lusup, temp; - double *ujrow; - double alpha = -1; - *info = 0; - - /* Quick return. */ - - /* Initialization. */ - iam = grid->iam; - krow = PROW( k, grid ); - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - j = LBj( k, grid ); /* Local block number */ - jfst = FstBlockC( k ); - jlst = FstBlockC( k+1 ); - lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize( k ); - if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; - ujrow = Llu->ujrow; - - luptr = 0; /* Point to the diagonal entries. */ - c = nsupc; - for (j = 0; j < jlst - jfst; ++j) { - /* Broadcast the j-th row (nsupc - j) elements to - the process column. */ - if ( iam == pkk ) { /* Diagonal process. */ - i = luptr; - if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { - if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ -#if ( PRNTlevel>=2 ) - printf("(%d) .. col %d, tiny pivot %e ", - iam, jfst+j, lusup[i]); -#endif - /* Keep the replaced diagonal with the same sign. */ - if ( lusup[i] < 0 ) lusup[i] = -thresh; - else lusup[i] = thresh; -#if ( PRNTlevel>=2 ) - printf("replaced by %e\n", lusup[i]); -#endif - ++(stat->TinyPivots); - } - } - for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; - } -#if 0 - dbcast_col(ujrow, c, pkk, UjROW, grid, &c); -#else - MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); - /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, - grid, COMM_COLUMN, &c);*/ -#endif - -#if ( DEBUGlevel>=2 ) -if ( k == 3329 && j == 2 ) { - if ( iam == pkk ) { - printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); - } else { - printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); - } -} -#endif - - if ( !lusup ) { /* Empty block column. */ - --c; - if ( ujrow[0] == 0.0 ) *info = j+jfst+1; - continue; - } - - /* Test for singularity. */ - if ( ujrow[0] == 0.0 ) { - *info = j+jfst+1; - } else { - /* Scale the j-th column of the matrix. */ - temp = 1.0 / ujrow[0]; - if ( iam == pkk ) { - for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr-j-1; - } else { - for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr; - } - } - - /* Rank-1 update of the trailing submatrix. */ - if ( --c ) { - if ( iam == pkk ) { - l = nsupr - j - 1; -#ifdef _CRAY - SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#else - dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#endif - stat->ops[FACT] += 2 * l * c; - } else { -#ifdef _CRAY - SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#else - dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#endif - stat->ops[FACT] += 2 * nsupr * c; - } - } - - /* Move to the next column. */ - if ( iam == pkk ) luptr += nsupr + 1; - else luptr += nsupr; - - } /* for j ... */ - -} /* PDGSTRF2 */ - - -/************************************************************************/ -static void pdgstrs2 -/************************************************************************/ -#ifdef _CRAY -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 - ) -#else -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat - ) -#endif -/* - * Purpose - * ======= - * Perform parallel triangular solves - * U(k,:) := A(k,:) \ L(k,k). - * Only the process column that owns block column *k* participates - * in the work. - * - * Arguments - * ========= - * - * m (input) int (global) - * Number of rows in the matrix. - * - * k (input) int (global) - * The row number of the block row to be factorized. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization; - * See SuperLUStat_t structure defined in util.h. - * - */ -{ - int iam, pkk; - int incx = 1; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ - int segsize; - int_t nsupc; /* number of columns in the block */ - int_t luptr, iukp, rukp; - int_t b, gb, j, klst, knsupc, lk, nb; - int_t *xsup = Glu_persist->xsup; - int_t *usub; - double *lusup, *uval; - - /* Quick return. */ - lk = LBi( k, grid ); /* Local block number */ - if ( !Llu->Unzval_br_ptr[lk] ) return; - - /* Initialization. */ - iam = grid->iam; - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - klst = FstBlockC( k+1 ); - knsupc = SuperSize( k ); - usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ - uval = Llu->Unzval_br_ptr[lk]; - nb = usub[0]; - iukp = BR_HEADER; - rukp = 0; - if ( iam == pkk ) { - lk = LBj( k, grid ); - nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ - lusup = Llu->Lnzval_bc_ptr[lk]; - } else { - nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ - lusup = Llu->Lval_buf_2[k%2]; - } - - /* Loop through all the row blocks. */ - for (b = 0; b < nb; ++b) { - gb = usub[iukp]; - nsupc = SuperSize( gb ); - iukp += UB_DESCRIPTOR; - - /* Loop through all the segments in the block. */ - for (j = 0; j < nsupc; ++j) { - segsize = klst - usub[iukp++]; - if ( segsize ) { /* Nonzero segment. */ - luptr = (knsupc - segsize) * (nsupr + 1); -#ifdef _CRAY - STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#else - dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#endif - stat->ops[FACT] += segsize * (segsize + 1); - rukp += segsize; - } - } - } /* for b ... */ - -} /* PDGSTRS2 */ - -static int -probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm, - int buf_size) -{ - MPI_Status status; - int count; - - MPI_Probe( source, tag, comm, &status ); - MPI_Get_count( &status, datatype, &count ); - if ( count > buf_size ) { - printf("(%d) Recv'ed count %d > buffer size $d\n", - iam, count, buf_size); - exit(-1); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,819 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_ddefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); -fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - - -void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - double *x, int nrhs, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PDGSTRS1 solves a system of distributed linear equations - * - * op( sub(A) ) * X = sub( B ) - * - * with a general N-by-N distributed matrix sub( A ) using the LU - * factorization computed by PDGSTRF. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures to store L and U factors, - * and the permutation vectors. - * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * x (input/output) double* - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * NOTE: the right-hand side matrix is already distributed on - * the diagonal processes. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves; - * See SuperLUStat_t structure defined in util.h. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *lusup, *dest; - double *recvbuf, *tempv; - double *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - MPI_Status status; -#ifdef ISEND_IRECV - MPI_Request *send_req, recv_req; -#endif - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for L-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -8; - if ( *info ) { - pxerbla("PDGSTRS1", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - Llu->SolveMsgSent = 0; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs1()"); -#endif - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS1. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#ifdef ISEND_IRECV - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Compute ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs - + nlb * LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); - if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - - /* - * Prepended the block number in the header for lsum[]. - */ - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; - } - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( !frecv[lk] && !fmod[lk] ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, - pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* - * Compute the internal nodes asynchronously by all processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel>=2 ) - if ( !iam ) printf("\n.. After L-solve: y =\n"); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - } - MPI_Barrier( grid->comm ); - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS1. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; - } - } - - /* Set up additional pointers for the index and value arrays of U. - nlb is the number of local block rows. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( !brecv[lk] && !bmod[lk] ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( !(--brecv[lk]) && !bmod[lk] ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - - /* Deallocate storage. */ - - SUPERLU_FREE(lsum); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrs1()"); -#endif - -} /* PDGSTRS1 */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,998 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_ddefs.h" - - -/*#define ISEND_IRECV*/ - -/* Parry's change - Use MPI_Bsend with a large buffer attached in the main program */ -#define BSEND 1 - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); -fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - - -void -pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B, - int_t ldb, int nrhs, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pdgstrs_Bglobal solves a system of distributed linear equations - * A*X = B with a general N-by-N matrix A using the LU factorization - * computed by pdgstrf. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input/output) double* - * On entry, the right-hand side matrix of the possibly equilibrated - * and row permuted system. - * On exit, the solution matrix of the possibly equilibrated - * and row permuted system if info = 0; - * - * NOTE: Currently, the N-by-NRHS matrix B must reside on all - * processes when calling this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *x; /* X component at step k. */ - double *lusup, *dest; - double *recvbuf, *tempv; - double *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - MPI_Status status; -#if defined(ISEND_IRECV) || defined(BSEND) - MPI_Request *send_req, recv_req; - int test_flag; -#endif - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for L-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - /*-- Function prototypes --*/ - extern void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], double [], int_t, double []); - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerbla("PDGSTRS_BGLOBAL", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; -#ifdef BSEND - if(!iam) { - printf("Using MPI_Bsend in triangular solve\n"); - fflush(stdout); - } -#endif - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - - stat->ops[SOLVE] = 0.0; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()"); -#endif - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#if defined(ISEND_IRECV) || defined(BSEND) - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); - for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL; -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H))) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) - ABORT("Malloc fails for x[]."); - if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - - /* - * Copy B into X on the diagonal processes. - */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ - kcol = PCOL( k, grid ); - if ( mycol == kcol ) { /* Diagonal process. */ - jj = X_BLK( lk ); - x[jj - XK_H] = k; /* Block number prepended in the header. */ - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ - x[i + jj + j*knsupc] = B[i + ii + j*ldb]; - } - } - ii += knsupc; - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=1 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( frecv[lk]==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV -#if 1 - MPI_Test( &send_req[p], &test_flag, &status ); -#else - if ( send_req[p] != MPI_REQUEST_NULL ) - MPI_Wait( &send_req[p], &status ); -#endif - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req,stat); -#ifdef ISEND_IRECV - /* Wait for previous Isends to complete. */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) - /*MPI_Wait( &send_req[p], &status );*/ - MPI_Test( &send_req[p], &test_flag, &status ); - } -#endif - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ -#if ( DEBUGlevel>=1 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV -#if 1 - MPI_Test( &send_req[p], &test_flag, &status ); -#else - if ( send_req[p] != MPI_REQUEST_NULL ) - MPI_Wait( &send_req[p], &status ); -#endif - MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[p]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); -#ifdef ISEND_IRECV - /* Wait for the previous Isends to complete. */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) - MPI_Test( &send_req[p], &test_flag, &status ); - } -#endif - } /* if */ - - break; - -#if ( DEBUGlevel>=1 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( PRNTlevel==2 ) - if ( !iam ) printf("\n.. After L-solve: y =\n"); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - } - MPI_Barrier( grid->comm ); - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - - /* MPI_Barrier( grid->comm ); Drain messages in the forward solve. */ - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; - } - } - - /* Set up additional pointers for the index and value arrays of U. - nlb is the number of local block rows. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=1 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV -#if 1 - MPI_Test( &send_req[p], &test_flag, &status ); -#else - if ( send_req[p] != MPI_REQUEST_NULL ) - MPI_Wait( &send_req[p], &status ); -#endif - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); -#ifdef ISEND_IRECV - /* Wait for the previous Isends to complete. */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) - /*MPI_Wait( &send_req[p], &status );*/ - MPI_Test( &send_req[p], &test_flag, &status ); - } -#endif - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); - - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV -#if 1 - MPI_Test( &send_req[p], &test_flag, &status ); -#else - if ( send_req[p] != MPI_REQUEST_NULL ) - MPI_Wait( &send_req[p], &status ); -#endif - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[p] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); -#ifdef ISEND_IRECV - /* Wait for the previous Isends to complete. */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) - /*MPI_Wait( &send_req[p], &status );*/ - MPI_Test( &send_req[p], &test_flag, &status ); - } -#endif - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=1 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - - - /* Copy the solution X into B (on all processes). */ - { - int_t num_diag_procs, *diag_procs, *diag_len; - double *work; - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); - if ( !(work = doubleMalloc_dist(jj*nrhs)) ) - ABORT("Malloc fails for work[]"); - gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, - grid, num_diag_procs, diag_procs, diag_len, - B, ldb, work); - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - SUPERLU_FREE(work); - } - - /* Deallocate storage. */ - - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (p = 0; p < Pr; ++p) { - if ( send_req[p] != MPI_REQUEST_NULL ) - MPI_Wait( &send_req[p], &status ); - } - SUPERLU_FREE(send_req); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()"); -#endif -/* Chao debug */ - - MPI_Barrier( grid->comm ); /* Drain messages in the forward solve. */ - -} /* PDGSTRS_BGLOBAL */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_diag_to_all(int_t n, int_t nrhs, double x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - double y[], int_t ldy, double work[]) -{ - int_t i, ii, j, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - double *x_col, *y_col; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ - x_col = &x[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; - lwork += knsupc; - x_col += knsupc; - } - } - MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - y_col = &y[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; - lwork += knsupc; - y_col += ldy; - } - } - } -} /* GATHER_DIAG_TO_ALL */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,955 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_ddefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); -fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif -static void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], double [], int_t, double []); - - -void -pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - double *B, int_t ldb, int nrhs, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pdgstrs_Bglobal solves a system of distributed linear equations - * A*X = B with a general N-by-N matrix A using the LU factorization - * computed by pdgstrf. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input/output) double* - * On entry, the right-hand side matrix of the possibly equilibrated - * and row permuted system. - * On exit, the solution matrix of the possibly equilibrated - * and row permuted system if info = 0; - * - * NOTE: Currently, the N-by-NRHS matrix B must reside on all - * processes when calling this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *x; /* X component at step k. */ - double *lusup, *dest; - double *recvbuf, *tempv; - double *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - MPI_Status status; -#if defined (ISEND_IRECV) || defined (BSEND) - MPI_Request *send_req, recv_req; -#endif - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for L-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerbla("PDGSTRS_BGLOBAL", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()"); -#endif - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#if defined (ISEND_IRECV) || defined (BSEND) - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs - + nlb * LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs - + nlb * XK_H)) ) - ABORT("Malloc fails for x[]."); - if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - - /* - * Copy B into X on the diagonal processes. - */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ - kcol = PCOL( k, grid ); - if ( mycol == kcol ) { /* Diagonal process. */ - jj = X_BLK( lk ); - x[jj - XK_H] = k; /* Block number prepended in the header. */ - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ - x[i + jj + j*knsupc] = B[i + ii + j*ldb]; - } - } - ii += knsupc; - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( frecv[lk]==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, - pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req,stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = *recvbuf; - - - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel>=2 ) - printf("\n(%d) .. After L-solve: y =\n", iam); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - } - MPI_Barrier( grid->comm ); - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; - } - } - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); - - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - - - /* Copy the solution X into B (on all processes). */ - { - int_t num_diag_procs, *diag_procs, *diag_len; - double *work; - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); - if ( !(work = doubleMalloc_dist(((size_t)jj)*nrhs)) ) - ABORT("Malloc fails for work[]"); - gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, - grid, num_diag_procs, diag_procs, diag_len, - B, ldb, work); - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - SUPERLU_FREE(work); - } - - /* Deallocate storage. */ - - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif -#ifdef BSEND - SUPERLU_FREE(send_req); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()"); -#endif - -} /* PDGSTRS_BGLOBAL */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_diag_to_all(int_t n, int_t nrhs, double x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - double y[], int_t ldy, double work[]) -{ - int_t i, ii, j, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - double *x_col, *y_col; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ - x_col = &x[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; - lwork += knsupc; - x_col += knsupc; - } - } - MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - y_col = &y[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; - lwork += knsupc; - y_col += ldy; - } - } - } -} /* GATHER_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1200 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include "superlu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - - -int_t -pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, - int_t fst_row, int_t *ilsum, double *x, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, - gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) -{ -/* - * Purpose - * ======= - * Re-distribute B on the diagonal processes of the 2D process mesh. - * - * Note - * ==== - * This routine can only be called after the routine pxgstrs_init(), - * in which the structures of the send and receive buffers are set up. - * - * Arguments - * ========= - * - * B (input) double* - * The distributed right-hand side matrix of the possibly - * equilibrated system. - * - * m_loc (input) int (local) - * The local row dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * fst_row (input) int (global) - * The row number of B's first row in the global matrix. - * - * ilsum (input) int* (global) - * Starting position of each supernode in a full array. - * - * x (output) double* - * The solution vector. It is valid only on the diagonal processes. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * SOLVEstruct (input) SOLVEstruct_t* - * Contains the information for the communication during the - * solution phase. - * - * Return value - * ============ - * - */ - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *perm_r, *perm_c; /* row and column permutation vectors */ - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *xsup, *supno; - int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; - int p, procs; - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - SendCnt = gstrs_comm->B_to_X_SendCnt; - SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; - RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; - sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; - rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - /* ------------------------------------------------------------ - NOW COMMUNICATE THE ACTUAL DATA. - ------------------------------------------------------------*/ - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls[p] * nrhs; - } - - /* Copy the row indices and values to the send buffer. */ - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - k = ptr_to_ibuf[p]; - send_ibuf[k] = irow; - k = ptr_to_dbuf[p]; - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - send_dbuf[k++] = B[i + j*ldb]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - - /* Communicate the (permuted) row indices. */ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - - /* Communicate the numerical values. */ - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - - /* ------------------------------------------------------------ - Copy buffer into X on the diagonal processes. - ------------------------------------------------------------*/ - ii = 0; - for (p = 0; p < procs; ++p) { - jj = rdispls_nrhs[p]; - for (i = 0; i < RecvCnt[p]; ++i) { - /* Only the diagonal processes do this; the off-diagonal processes - have 0 RecvCnt. */ - irow = recv_ibuf[ii]; /* The permuted row index. */ - k = BlockNum( irow ); - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number. */ - l = X_BLK( lk ); - x[l - XK_H] = k; /* Block number prepended in the header. */ - irow = irow - FstBlockC(k); /* Relative row number in X-block */ - RHS_ITERATE(j) { - x[l + irow + j*knsupc] = recv_dbuf[jj++]; - } - ++ii; - } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()"); -#endif - return 0; -} /* pdReDistribute_B_to_X */ - -int_t -pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row, - int_t nrhs, double *x, int_t *ilsum, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) -{ -/* - * Purpose - * ======= - * Re-distribute X on the diagonal processes to B distributed on all - * the processes. - * - * Note - * ==== - * This routine can only be called after the routine pxgstrs_init(), - * in which the structures of the send and receive buffers are set up. - * - */ - int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; - int_t *xsup, *supno; - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - int iam, p, q, pkk, procs; - int_t num_diag_procs, *diag_procs; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; - iam = grid->iam; - procs = grid->nprow * grid->npcol; - - SendCnt = gstrs_comm->X_to_B_SendCnt; - SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; - RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; - sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; - rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls_nrhs[p]; - } - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number */ - irow = FstBlockC( k ); - l = X_BLK( lk ); - for (i = 0; i < knsupc; ++i) { -#if 0 - ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ -#else - ii = irow; -#endif - q = row_to_proc[ii]; - jj = ptr_to_ibuf[q]; - send_ibuf[jj] = ii; - jj = ptr_to_dbuf[q]; - RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ - send_dbuf[jj++] = x[l + i + j*knsupc]; - } - ++ptr_to_ibuf[q]; - ptr_to_dbuf[q] += nrhs; - ++irow; - } - } - } - } - - /* ------------------------------------------------------------ - COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. - ------------------------------------------------------------*/ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - - /* ------------------------------------------------------------ - COPY THE BUFFER INTO B. - ------------------------------------------------------------*/ - for (i = 0, k = 0; i < m_loc; ++i) { - irow = recv_ibuf[i]; - irow -= fst_row; /* Relative row number */ - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - B[irow + j*ldb] = recv_dbuf[k++]; - } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()"); -#endif - return 0; - -} /* pdReDistribute_X_to_B */ - - -void -pdgstrs(int_t n, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, - gridinfo_t *grid, double *B, - int_t m_loc, int_t fst_row, int_t ldb, int nrhs, - SOLVEstruct_t *SOLVEstruct, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PDGSTRS solves a system of distributed linear equations - * A*X = B with a general N-by-N matrix A using the LU factorization - * computed by PDGSTRF. - * If the equilibration, and row and column permutations were performed, - * the LU factorization was performed for A1 where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * and the linear system solved is - * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and - * the permutation to B1 by Pc*Pr is applied internally in this routine. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from PDGSTRF for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * A may be scaled and permuted into A1, so that - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input/output) double* - * On entry, the distributed right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * On exit, the distributed solution matrix Y of the possibly - * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, - * and X is the solution of the original system. - * - * m_loc (input) int (local) - * The local row dimension of matrix B. - * - * fst_row (input) int (global) - * The row number of B's first row in the global matrix. - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double zero = 0.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *x; /* X component at step k. */ - /* NOTE: x and lsum are of same size. */ - double *lusup, *dest; - double *recvbuf, *tempv; - double *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *supno, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - MPI_Status status; -#ifdef ISEND_IRECV - MPI_Request *send_req, recv_req; -#endif - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve -- - Count the number of local block products to - be summed into lsum[lk]. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of lsum[lk] contributions to be received - from processes in this row. - It is only valid on the diagonal processes. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerbla("PDGSTRS", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs()"); -#endif - - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#ifdef ISEND_IRECV - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) - ABORT("Malloc fails for x[]."); - if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - /* Redistribute B into X on the diagonal processes. */ - pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); - - /* Set up the headers in lsum[]. */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ - } - ii += knsupc; - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( frecv[lk]==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", - &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - } - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", - &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel==2 ) - { - printf("(%d) .. After L-solve: y =\n", iam); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; - } - } - } - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", - &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, - grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - k = *recvbuf; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - } - - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", - &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#endif - stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, - grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - -#if ( DEBUGlevel>=2 ) - { - double *x_col; - int diag; - printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - diag = PNUM( krow, kcol, grid); - if ( iam == diag ) { /* Diagonal process. */ - lk = LBi( k, grid ); - jj = X_BLK( lk ); - x_col = &x[jj]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ - printf("\t(%d)\t%4d\t%.10f\n", - iam, xsup[k]+i, x_col[i]); - } - x_col += knsupc; - } - } - ii += knsupc; - } /* for k ... */ - } -#endif - - pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); - - - /* Deallocate storage. */ - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) { - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrs()"); -#endif - -} /* PDGSTRS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,363 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); -fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - -/************************************************************************/ -void dlsum_fmod -/************************************************************************/ -( - double *lsum, /* Sum of local modifications. */ - double *x, /* X array (local) */ - double *xk, /* X[k]. */ - double *rtemp, /* Result of full matrix-vector multiply. */ - int nrhs, /* Number of right-hand sides. */ - int knsupc, /* Size of supernode k. */ - int_t k, /* The k-th component of X. */ - int_t *fmod, /* Modification count for L-solve. */ - int_t nlb, /* Number of L blocks. */ - int_t lptr, /* Starting position in lsub[*]. */ - int_t luptr, /* Starting position in lusup[*]. */ - int_t *xsup, - gridinfo_t *grid, - LocalLU_t *Llu, - MPI_Request send_req[], - SuperLUStat_t *stat -) -{ -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= L_i,k * X[k]. - */ - double alpha = 1.0, beta = 0.0; - double *lusup, *lusup1; - double *dest; - int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; - int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel; - int_t *lsub, *lsub1, nlb1, lptr1, luptr1; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; - MPI_Status status; - int test_flag; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Llu->Lrowind_bc_ptr[lk]; - lusup = Llu->Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; - - for (lb = 0; lb < nlb; ++lb) { - ik = lsub[lptr]; /* Global block number, row-wise. */ - nbrow = lsub[lptr+1]; -#ifdef _CRAY - SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); -#elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); -#else - hypre_F90_NAME_BLAS(dgemm,DGEMM)( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow, 1, 1 ); -#endif - stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; - - lk = LBi( ik, grid ); /* Local block number, row-wise. */ - iknsupc = SuperSize( ik ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - lptr += LB_DESCRIPTOR; - rel = xsup[ik]; /* Global row index of block ik. */ - for (i = 0; i < nbrow; ++i) { - irow = lsub[lptr++] - rel; /* Relative row. */ - RHS_ITERATE(j) - dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; - } - luptr += nbrow; - - if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ - ikcol = PCOL( ik, grid ); - p = PNUM( myrow, ikcol, grid ); - if ( iam != p ) { -#ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); -#else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -#endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( lk ); - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; - if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( ik, grid );/* Local block number, column-wise. */ - lsub1 = Llu->Lrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", - &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); -#endif - stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, ikcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nlb1 = lsub1[0] - 1; - lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - fmod, nlb1, lptr1, luptr1, xsup, - grid, Llu, send_req, stat); - } /* if frecv[lk] == 0 */ - } /* if iam == p */ - } /* if fmod[lk] == 0 */ - - } /* for lb ... */ - -} /* dLSUM_FMOD */ - - -/************************************************************************/ -void dlsum_bmod -/************************************************************************/ -( - double *lsum, /* Sum of local modifications. */ - double *x, /* X array (local). */ - double *xk, /* X[k]. */ - int nrhs, /* Number of right-hand sides. */ - int_t k, /* The k-th component of X. */ - int_t *bmod, /* Modification count for L-solve. */ - int_t *Urbs, /* Number of row blocks in each block column of U.*/ - Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ - int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ - int_t *xsup, - gridinfo_t *grid, - LocalLU_t *Llu, - MPI_Request send_req[], - SuperLUStat_t *stat - ) -{ -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= U_i,k * X[k]. - */ - double alpha = 1.0; - int iam, iknsupc, knsupc, myrow, nsupr, p, pi; - int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, - j, jj, lk, lk1, nub, ub, uptr; - int_t *usub; - double *uval, *dest, *y; - int_t *lsub; - double *lusup; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *brecv = Llu->brecv; - int_t **bsendx_plist = Llu->bsendx_plist; - MPI_Status status; - int test_flag; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - knsupc = SuperSize( k ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - for (ub = 0; ub < nub; ++ub) { - ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ - usub = Llu->Ufstnz_br_ptr[ik]; - uval = Llu->Unzval_br_ptr[ik]; - i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ - i += UB_DESCRIPTOR; - il = LSUM_BLK( ik ); - gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ - iknsupc = SuperSize( gik ); - ikfrow = FstBlockC( gik ); - iklrow = FstBlockC( gik+1 ); - - RHS_ITERATE(j) { - dest = &lsum[il + j*iknsupc]; - y = &xk[j*knsupc]; - uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ - for (jj = 0; jj < knsupc; ++jj) { - fnz = usub[i + jj]; - if ( fnz < iklrow ) { /* Nonzero segment. */ - /* AXPY */ - for (irow = fnz; irow < iklrow; ++irow) - dest[irow - ikfrow] -= uval[uptr++] * y[jj]; - stat->ops[SOLVE] += 2 * (iklrow - fnz); - } - } /* for jj ... */ - } - - if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ - gikcol = PCOL( gik, grid ); - p = PNUM( myrow, gikcol, grid ); - if ( iam != p ) { -#ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); -#else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -#endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( ik ); - dest = &x[ii]; - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; - if ( !brecv[ik] ) { /* Becomes a leaf node. */ - bmod[ik] = -1; /* Do not solve X[k] in the future. */ - lk1 = LBj( gik, grid ); /* Local block number. */ - lsub = Llu->Lrowind_bc_ptr[lk1]; - lusup = Llu->Lnzval_bc_ptr[lk1]; - nsupr = lsub[1]; -#ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); -#elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", - &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); -#endif - stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, gik); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( bsendx_plist[lk1][p] != EMPTY ) { - pi = PNUM( p, gikcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - if ( Urbs[lk1] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if brecv[ik] == 0 */ - } - } /* if bmod[ik] == 0 */ - - } /* for ub ... */ - -} /* dlSUM_BMOD */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlangs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,128 +0,0 @@ - - -/* - * File name: pdlangs.c - * History: Modified from lapack routine DLANGE - */ -#include -#include "superlu_ddefs.h" - -double pdlangs(char *norm, SuperMatrix *A, gridinfo_t *grid) -{ -/* - Purpose - ======= - - PDLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - PDLANGE returns the value - - PDLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in DLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - GRID (input) gridinof_t* - The 2D process mesh. - ===================================================================== -*/ - - /* Local variables */ - NRformat_loc *Astore; - int_t m_loc; - double *Aval; - int_t i, j, irow, jcol; - double value=0., sum; - double *rwork; - double tempvalue; - double *temprwork; - - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - Aval = (double *) Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - value = SUPERLU_MAX( value, fabs(Aval[j]) ); - } - - MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - value = tempvalue; - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; -#if 0 - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += fabs(Aval[i]); - value = SUPERLU_MAX(value,sum); - } -#else /* XSL ==> */ - if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) ) - ABORT("doubleCalloc_dist fails for rwork."); - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - rwork[jcol] += fabs(Aval[j]); - } - } - - if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) ) - ABORT("doubleCalloc_dist fails for temprwork."); - MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm); - value = 0.; - for (j = 0; j < A->ncol; ++j) { - value = SUPERLU_MAX(value, temprwork[j]); - } - SUPERLU_FREE (temprwork); - SUPERLU_FREE (rwork); -#endif - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - value = 0.; - sum = 0.; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - sum += fabs(Aval[j]); - value = SUPERLU_MAX(value, sum); - } - MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - value = tempvalue; - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else { - ABORT("Illegal norm specified."); - } - - return (value); - -} /* pdlangs */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ - - -/* - * File name: pdlaqgs.c - * History: Modified from LAPACK routine DLAQGE - */ -#include -#include "superlu_ddefs.h" - -void -pdlaqgs(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - PDLAQGS equilibrates a general sparse M by N matrix A using the row - and column scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NRformat_loc *Astore; - double *Aval; - int_t i, j, irow, jcol, m_loc; - double large, small, cj; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *); - /* extern double dlamch_(char *); */ - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - m_loc = Astore->m_loc; - - /* Initialize LARGE and SMALL. */ - /* small = dlamch_("Safe minimum") / dlamch_("Precision"); */ - small = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum") / hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - Aval[j] *= c[jcol]; - } - ++irow; - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - Aval[j] *= r[irow]; - ++irow; - } - *(unsigned char *)equed = 'R'; - } else { - /* Both row and column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - Aval[j] = Aval[j] * r[irow] * c[jcol]; - } - ++irow; - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* pdlaqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1943 +0,0 @@ - - -/* - * -- Parallel symbolic factorization auxialiary routine (version 2.2) -- - * -- Distributes the data from parallel symbolic factorization - * -- to numeric factorization - * INRIA France - July 1, 2004 - * Laura Grigori - * - * November 1, 2007 - * Feburary 20, 2008 - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -#include "superlu_ddefs.h" -#include "psymbfact.h" - -static float -dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, - Glu_persist_t *Glu_persist, - int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, - gridinfo_t *grid - ) -/* - * Purpose - * ======= - * - * Redistribute the symbolic structure of L and U from the distribution - * used in the parallel symbolic factorization step to the distdibution - * used in the parallel numeric factorization step. On exit, the L and U - * structure for the 2D distribution used in the numeric factorization step is - * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal - * information is also computed and it is stored in Glu_persist->supno - * and Glu_persist->xsup. - * - * This routine allocates memory for storing the structure of L and U - * and the supernodes information. This represents the arrays: - * p_xlsub, p_lsub, p_xusub, p_usub, - * Glu_persist->supno, Glu_persist->xsup. - * - * This routine also deallocates memory allocated during symbolic - * factorization routine. That is, the folloing arrays are freed: - * Pslu_freeable->xlsub, Pslu_freeable->lsub, - * Pslu_freeable->xusub, Pslu_freeable->usub, - * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, - * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. - * - * Arguments - * ========= - * - * n (Input) int_t - * Order of the input matrix - * Pslu_freeable (Input) Pslu_freeable_t * - * Local L and U structure, - * global to local indexing information. - * - * Glu_persist (Output) Glu_persist_t * - * Stores on output the information on supernodes mapping. - * - * p_xlsub (Output) int_t ** - * Pointer to structure of L distributed on a 2D grid - * of processors, stored by columns. - * - * p_lsub (Output) int_t ** - * Structure of L distributed on a 2D grid of processors, - * stored by columns. - * - * p_xusub (Output) int_t ** - * Pointer to structure of U distributed on a 2D grid - * of processors, stored by rows. - * - * p_usub (Output) int_t ** - * Structure of U distributed on a 2D grid of processors, - * stored by rows. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU. - * > 0, number of bytes allocated in this routine when out of memory. - * (an approximation). - */ -{ - int iam, nprocs, pc, pr, p, np, p_diag; - int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, - *tmp_ptrToSend, *mem; - int_t *nnzToRecv_l, *nnzToRecv_u; - int_t *send_1, *send_2, nsend_1, nsend_2; - int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; - int_t nsupers, nsupers_i, nsupers_j; - int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; - int_t maxszsn, maxNvtcsPProc; - int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; - int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; - int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; - int_t *xsub_s, *sub_s, *xsub_n, *sub_n; - int_t *globToLoc, nvtcs_loc; - int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, - RecvCnt_l, RecvCnt_u, ind_loc; - int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; - int_t nelts, isize; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_symbLU()"); -#endif - nprocs = (int) grid->nprow * grid->npcol; - xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; - xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = Pslu_freeable->nvtcs_loc; - xsup_beg_s = Pslu_freeable->xsup_beg_loc; - xsup_end_s = Pslu_freeable->xsup_end_loc; - supno_s = Pslu_freeable->supno_loc; - rcv_luind = NULL; - iword = sizeof(int_t); - dword = sizeof(double); - memAux = 0.; memRet = 0.; - - mem = intCalloc_dist(12 * nprocs); - if (!mem) - return (ERROR_RET); - memAux = (float) (12 * nprocs * sizeof(int_t)); - nnzToRecv = mem; - nnzToSend = nnzToRecv + 2*nprocs; - nnzToSend_l = nnzToSend + 2 * nprocs; - nnzToSend_u = nnzToSend_l + nprocs; - send_1 = nnzToSend_u + nprocs; - send_2 = send_1 + nprocs; - tmp_ptrToSend = send_2 + nprocs; - nnzToRecv_l = tmp_ptrToSend + nprocs; - nnzToRecv_u = nnzToRecv_l + nprocs; - - ptrToSend = nnzToSend; - ptrToRecv = nnzToSend + nprocs; - - nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); - intBuf1 = nvtcs + nprocs; - intBuf2 = nvtcs + 2 * nprocs; - intBuf3 = nvtcs + 3 * nprocs; - intBuf4 = nvtcs + 4 * nprocs; - memAux += 5 * nprocs * sizeof(int); - - maxszsn = sp_ienv_dist(3); - - /* Allocate space for storing Glu_persist_n. */ - if ( !(supno_n = intMalloc_dist(n+1)) ) { - fprintf (stderr, "Malloc fails for supno_n[]."); - return (memAux); - } - memRet += (float) ((n+1) * sizeof(int_t)); - - /* ------------------------------------------------------------ - DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION - ------------------------------------------------------------*/ - - if (nvtcs_loc > INT_MAX) - ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); - intNvtcs_loc = (int) nvtcs_loc; - MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, - 0, grid->comm); - - if (!iam) { - /* set ptrToRecv to point to the beginning of the data for - each processor */ - for (k = 0, p = 0; p < nprocs; p++) { - ptrToRecv[p] = k; - k += nvtcs[p]; - } - } - - if (nprocs > 1) { - temp = NULL; - if (!iam ) { - if ( !(temp = intMalloc_dist (n+1)) ) { - fprintf (stderr, "Malloc fails for temp[]."); - return (memAux + memRet); - } - memAux += (float) (n+1) * iword; - } -#if defined (_LONGINT) - for (p=0; p INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = ptrToRecv; -#endif - MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, - temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); - } - else - temp = supno_s; - - if (!iam) { - nsupers = 0; - p = (int) OWNER( globToLoc[0] ); - gb = temp[ptrToRecv[p]]; - supno_n[0] = nsupers; - ptrToRecv[p] ++; - szsn = 1; - for (j = 1; j < n; j ++) { - if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { - nsupers ++; - p = (int) OWNER( globToLoc[j] ); - gb = temp[ptrToRecv[p]]; - szsn = 1; - } - else { - szsn ++; - } - ptrToRecv[p] ++; - supno_n[j] = nsupers; - } - nsupers++; - if (nprocs > 1) { - SUPERLU_FREE (temp); - memAux -= (float) (n+1) * iword; - } - supno_n[n] = nsupers; - } - - /* reset to 0 nnzToSend */ - for (p = 0; p < 2 *nprocs; p++) - nnzToSend[p] = 0; - - MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); - nsupers = supno_n[n]; - /* Allocate space for storing Glu_persist_n. */ - if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { - fprintf (stderr, "Malloc fails for xsup_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers+1) * iword; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF L and U. - ------------------------------------------------------------*/ - gb = EMPTY; - for (i = 0; i < n; i++) { - if (gb != supno_n[i]) { - /* a new supernode starts */ - gb = supno_n[i]; - xsup_n[gb] = i; - } - } - xsup_n[nsupers] = n; - - for (p = 0; p < nprocs; p++) { - send_1[p] = FALSE; - send_2[p] = FALSE; - } - for (gb_n = 0; gb_n < nsupers; gb_n ++) { - i = xsup_n[gb_n]; - if (iam == (int) OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { - k = lsub_s[j]; - if (k >= i) { - gb = supno_n[k]; - p = (int) PNUM( PROW(gb, grid), pc, grid ); - nnzToSend[2*p] ++; - send_1[p] = TRUE; - } - } - for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { - k = usub_s[j]; - if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { - gb = supno_n[k]; - p = PNUM( pr, PCOL(gb, grid), grid); - nnzToSend[2*p+1] ++; - send_2[p] = TRUE; - } - } - - nsend_2 = 0; - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - nnzToSend[2*p+1] += 2; - if (send_2[p]) nsend_2 ++; - } - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) - if (send_2[p] || p == p_diag) { - if (p == p_diag && !send_2[p]) - nnzToSend[2*p+1] += nsend_2; - else - nnzToSend[2*p+1] += nsend_2-1; - send_2[p] = FALSE; - } - nsend_1 = 0; - for (p = pc; p < nprocs; p += grid->npcol) { - nnzToSend[2*p] += 2; - if (send_1[p]) nsend_1 ++; - } - for (p = pc; p < nprocs; p += grid->npcol) - if (send_1[p]) { - nnzToSend[2*p] += nsend_1-1; - send_1[p] = FALSE; - } - else - nnzToSend[2*p] += nsend_1; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, - grid->comm); - - nnz_loc_l = nnz_loc_u = 0; - SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; - for (p = 0; p < nprocs; p++) { - if ( p != iam ) { - SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; - SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; - RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; - RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } else { - nnz_loc_l += nnzToRecv[2*p]; - nnz_loc_u += nnzToRecv[2*p+1]; - nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; - nnzToRecv_l[p] = nnzToRecv[2*p]; - nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } - } - - /* Allocate space for storing the symbolic structure after redistribution. */ - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for xlsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_j+1) * iword; - - if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for xusub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_i+1) * iword; - - /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ - if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { - if (!(rcv_luind = - intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { - fprintf (stderr, "Malloc fails for rcv_luind[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) - * iword; - } - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { - fprintf (stderr, "Malloc fails for index[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------------ - LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF L and U. - ------------------------------------------------------------------*/ - sendL = TRUE; - sendU = FALSE; - while (sendL || sendU) { - if (sendL) { - xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; - nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; - } - if (sendU) { - xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; - nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; - } - for (i = 0, j = 0, p = 0; p < nprocs; p++) { - if ( p != iam ) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - ptrToRecv[p] = j; j += nnzToRecv[p]; - } - nnzToRecv[iam] = 0; - - ind_loc = ptrToRecv[iam]; - for (gb_n = 0; gb_n < nsupers; gb_n++) { - nsend_2 = 0; - i = xsup_n[gb_n]; - if (iam == OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid ); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - - if (sendL) { - p = pc; np = grid->nprow; - } else { - p = pr * grid->npcol; np = grid->npcol; - } - for (j = 0; j < np; j++) { - if (p == iam) { - rcv_luind[ind_loc] = gb_n; - rcv_luind[ind_loc+1] = 0; - tmp_ptrToSend[p] = ind_loc + 1; - ind_loc += 2; - } - else { - snd_luind[ptrToSend[p]] = gb_n; - snd_luind[ptrToSend[p]+1] = 0; - tmp_ptrToSend[p] = ptrToSend[p] + 1; - ptrToSend[p] += 2; - } - if (sendL) p += grid->npcol; - if (sendU) p++; - } - for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { - k = sub_s[j]; - if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { - gb = supno_n[k]; - if (sendL) - p = PNUM( PROW(gb, grid), pc, grid ); - else - p = PNUM( pr, PCOL(gb, grid), grid); - if (send_1[p] == FALSE) { - send_1[p] = TRUE; - send_2[nsend_2] = k; nsend_2 ++; - } - if (p == iam) { - rcv_luind[ind_loc] = k; ind_loc++; - if (sendL) - xsub_n[LBj( gb_n, grid )] ++; - else - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = k; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - if (sendL) - for (p = pc; p < nprocs; p += grid->npcol) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if (PNUM(PROW(gb, grid), pc, grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBj( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - if (sendU) - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - if (send_1[p] || p == p_diag) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if(PNUM( pr, PCOL(gb, grid), grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - } - } - } - - /* reset ptrToSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv) */ - for (i = 0, p = 0; p < nprocs; p++) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs > 1) { -#if defined (_LONGINT) - nnzToSend[iam] = 0; - for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptrToSend[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptrToSend; - intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; -#endif - - MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, - rcv_luind, intBuf3, intBuf4, mpi_int_t, - grid->comm); - } - if (sendL) - nnzToRecv[iam] = nnz_loc_l; - else - nnzToRecv[iam] = nnz_loc_u; - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE. - -------------------------------------------------------------*/ - if (sendU) - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - SUPERLU_FREE (snd_luind); - memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------ - CONVERT THE FORMAT. - ------------------------------------------------------------*/ - /* Initialize the array of column of L/ row of U pointers */ - k = 0; - for (p = 0; p < nprocs; p ++) { - if (p != iam) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - nelts = rcv_luind[i+1]; - if (sendL) - xsub_n[LBj( gb, grid )] = nelts; - else - xsub_n[LBi( gb, grid )] = nelts; - i += nelts + 2; - } - } - k += nnzToRecv[p]; - } - - if (sendL) j = nsupers_j; - else j = nsupers_i; - k = 0; - isize = xsub_n[0]; - xsub_n[0] = 0; - for (gb_l = 1; gb_l < j; gb_l++) { - k += isize; - isize = xsub_n[gb_l]; - xsub_n[gb_l] = k; - } - xsub_n[gb_l] = k + isize; - nnz_loc = xsub_n[gb_l]; - if (sendL) { - lsub_n = NULL; - if (nnz_loc) { - if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for lsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = lsub_n; - } - if (sendU) { - usub_n = NULL; - if (nnz_loc) { - if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for usub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = usub_n; - } - - /* Copy the data into the L column / U row oriented storage */ - k = 0; - for (p = 0; p < nprocs; p++) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - if (gb >= nsupers) - printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", - iam, p, gb, nsupers, i, i-k); - i += 2; - if (sendL) gb_l = LBj( gb, grid ); - if (sendU) gb_l = LBi( gb, grid ); - for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { - sub_n[j] = rcv_luind[i]; - } - } - k += nnzToRecv[p]; - } - if (sendL) { - sendL = FALSE; sendU = TRUE; - } - else - sendU = FALSE; - } - - /* deallocate memory allocated during symbolic factorization routine */ - if (rcv_luind != NULL) { - SUPERLU_FREE (rcv_luind); - memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; - } - SUPERLU_FREE (mem); - memAux -= (float) (12 * nprocs * iword); - SUPERLU_FREE(nvtcs); - memAux -= (float) (5 * nprocs * sizeof(int)); - - if (xlsub_s != NULL) { - SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); - } - if (xusub_s != NULL) { - SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); - } - SUPERLU_FREE (globToLoc); - if (supno_s != NULL) { - SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); - SUPERLU_FREE (supno_s); - } - - Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; - *p_xlsub = xlsub_n; *p_lsub = lsub_n; - *p_xusub = xusub_n; *p_usub = usub_n; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dist_symbLU()"); -#endif - - return (-memRet); -} - -static float -ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val, - int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val, - int_t *ilsum_i, int_t *ilsum_j - ) -{ -/* - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. The lower part is - * stored using a column format and the upper part - * is stored using a row format. - * - * Arguments - * ========= - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_persist (Input) Glu_persist_t * - * Information on supernodes mapping. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * p_ainf_colptr (Output) int_t** - * Pointer to the lower part of A distributed on a 2D grid - * of processors, stored by columns. - * - * p_ainf_rowind (Output) int_t** - * Structure of of the lower part of A distributed on a - * 2D grid of processors, stored by columns. - * - * p_ainf_val (Output) double** - * Numerical values of the lower part of A, distributed on a - * 2D grid of processors, stored by columns. - * - * p_asup_rowptr (Output) int_t** - * Pointer to the upper part of A distributed on a 2D grid - * of processors, stored by rows. - * - * p_asup_colind (Output) int_t** - * Structure of of the upper part of A distributed on a - * 2D grid of processors, stored by rows. - * - * p_asup_val (Output) double** - * Numerical values of the upper part of A, distributed on a - * 2D grid of processors, stored by rows. - * - * ilsum_i (Input) int_t * - * Starting position of each supernode in - * the full array (local, block row wise). - * - * ilsum_j (Input) int_t * - * Starting position of each supernode in - * the full array (local, block column wise). - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated when out of memory. - * (an approximation). - * - */ - int iam, p, procs; - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; - int_t nsupers, nsupers_i, nsupers_j; - int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - double *asup_val, *ainf_val; - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; - double *nzval_a; - MPI_Request *send_req; - MPI_Status status; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword, szbuf; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter ddist_A()"); -#endif - iword = sizeof(int_t); - dword = sizeof(double); - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - if (!(nnzToRecv = intCalloc_dist(2*procs))) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (ERROR_RET); - } - memAux = (float) (2 * procs * iword); - memRet = 0.; - nnzToSend = nnzToRecv + procs; - nsupers = supno[n-1] + 1; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - szbuf = k; - - /* Allocate space for storing the triplets after redistribution. */ - if ( !(ia = intMalloc_dist(2*k)) ) { - fprintf (stderr, "Malloc fails for ia[]."); - return (memAux); - } - memAux += (float) (2*k*iword); - ja = ia + k; - if ( !(aij = doubleMalloc_dist(k)) ) { - fprintf (stderr, "Malloc fails for aij[]."); - return (memAux); - } - memAux += (float) (k*dword); - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { - fprintf (stderr, "Malloc fails for send_req[]."); - return (memAux); - } - memAux += (float) (2*procs *sizeof(MPI_Request)); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ia_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(int_t*)); - if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for aij_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(double*)); - if ( !(index = intMalloc_dist(2*SendCnt)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memAux); - } - memAux += (float) (2*SendCnt*iword); - if ( !(nzval = doubleMalloc_dist(SendCnt)) ) { - fprintf(stderr, "Malloc fails for nzval[]."); - return (memAux); - } - memAux += (float) (SendCnt * dword); - if ( !(ptr_to_send = intCalloc_dist(procs)) ) { - fprintf(stderr, "Malloc fails for ptr_to_send[]."); - return (memAux); - } - memAux += (float) (procs * iword); - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for itemp[]."); - return (memAux); - } - memAux += (float) (2*maxnnzToRecv*iword); - if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for dtemp[]."); - return (memAux); - } - memAux += (float) (maxnnzToRecv * dword); - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { - fprintf (stderr, "Malloc fails for *ainf_colptr[]."); - return (memAux); - } - memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; - if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { - fprintf (stderr, "Malloc fails for *asup_rowptr[]."); - return (memAux+memRet); - } - memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nnz_loc_ainf = nnz_loc_asup = 0; - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, MPI_DOUBLE, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - irow = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /* assert(jcol= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - MPI_Wait( &send_req[p], &status); - MPI_Wait( &send_req[procs+p], &status); - } - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - - SUPERLU_FREE(nnzToRecv); - memAux -= 2 * procs * iword; - if ( procs > 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - SUPERLU_FREE(ptr_to_send); - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + - procs*sizeof(double*) + 2*SendCnt * iword + - SendCnt* dword + procs*iword + - 2*maxnnzToRecv*iword + maxnnzToRecv*dword; - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT. - ------------------------------------------------------------*/ - if (nnz_loc_ainf != 0) { - if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_rowind[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * iword); - if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_val[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * dword); - } - else { - ainf_rowind = NULL; - ainf_val = NULL; - } - if (nnz_loc_asup != 0) { - if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_colind[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * iword); - if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_val[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * dword); - } - else { - asup_colind = NULL; - asup_val = NULL; - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = ainf_colptr[0]; ainf_colptr[0] = 0; - for (j = 1; j < ilsum_j[nsupers_j]; j++) { - k += jsize; - jsize = ainf_colptr[j]; - ainf_colptr[j] = k; - } - ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; - i = 0; - isize = asup_rowptr[0]; asup_rowptr[0] = 0; - for (j = 1; j < ilsum_i[nsupers_i]; j++) { - i += isize; - isize = asup_rowptr[j]; - asup_rowptr[j] = i; - } - asup_rowptr[ilsum_i[nsupers_i]] = i + isize; - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - jcol = ja[i]; - irow = ia[i]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); - k = ainf_colptr[j]; - ainf_rowind[k] = irow; - ainf_val[k] = aij[i]; - ainf_colptr[j] ++; - } - else { - j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); - k = asup_rowptr[j]; - asup_colind[k] = jcol; - asup_val[k] = aij[i]; - asup_rowptr[j] ++; - } - } - - /* Reset the column pointers to the beginning of each column */ - for (j = ilsum_j[nsupers_j]; j > 0; j--) - ainf_colptr[j] = ainf_colptr[j-1]; - for (j = ilsum_i[nsupers_i]; j > 0; j--) - asup_rowptr[j] = asup_rowptr[j-1]; - ainf_colptr[0] = 0; - asup_rowptr[0] = 0; - - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - memAux -= 2*szbuf*iword + szbuf*dword; - - *p_ainf_colptr = ainf_colptr; - *p_ainf_rowind = ainf_rowind; - *p_ainf_val = ainf_val; - *p_asup_rowptr = asup_rowptr; - *p_asup_colind = asup_colind; - *p_asup_val = asup_val; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit ddist_A()"); - fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); -#endif - - return (-memRet); -} /* dist_A */ - -int_t -ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Pslu_freeable_t *Pslu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * - * - * Purpose - * ======= - * Distribute the input matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * This routine should not be called for this case, an error - * is generated. Instead, pddistribute routine should be called. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (Input) int - * Dimension of the matrix. - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (Input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (Input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated for performing the distribution - * of the data, when out of memory. - * (an approximation). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t Glu_freeable_n; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, - len, len1, nsupc, nsupc_gb, ii, nprocs; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - double *a; - int_t *asub, *xa; - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - double *asup_val, *ainf_val; - int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; - int_t next_ind; /* next available position in index[*] */ - int_t next_val; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - double *lusup, *uval; /* nonzero values in L and U */ - int_t *recvBuf; - int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in - the full array (local, block column wise) */ - /*-- Auxiliary arrays; freed on return --*/ - int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *LUb_length; /* L,U block length; size nsupers_ij */ - int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ - int_t *LUb_number; /* global block number; size nsupers_ij */ - int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ - int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float memStrLU, memA, - memDist = 0.; /* memory used for redistributing the data, which does - not include the memory for the numerical values of L and U */ - float memNLU = 0.; /* memory allocated for storing the numerical values of - L and U, that will be used in the numeric factorization */ - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif - - /* Initialization. */ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); -#endif - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nprocs = grid->npcol * grid->nprow; - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - Astore = (NRformat_loc *) A->Store; - - iword = sizeof(int_t); - dword = sizeof(double); - - if (fact == SamePattern_SameRowPerm) { - ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); - } - - if ((memStrLU = - dist_symbLU (n, Pslu_freeable, - Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) - return (memStrLU); - memDist += (-memStrLU); - xsup = Glu_persist->xsup; /* supernode and column mapping */ - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ - nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ - nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); - if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for ilsum[]."); - return (memDist + memNLU); - } - memNLU += (nsupers_i+1) * iword; - if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for ilsum_j[]."); - return (memDist + memNLU); - } - memDist += (nsupers_j+1) * iword; - - /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */ - ilsum[0] = 0; - ldaspa = 0; - for (gb = 0; gb < nsupers; gb++) - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - ilsum[nsupers_i] = ldaspa; - - ldaspa_j = 0; ilsum_j[0] = 0; - for (gb = 0; gb < nsupers; gb++) - if (mycol == PCOL( gb, grid )) { - i = SuperSize( gb ); - ldaspa_j += i; - lb = LBj( gb, grid ); - ilsum_j[lb + 1] = ilsum_j[lb] + i; - } - ilsum_j[nsupers_j] = ldaspa_j; - - if ((memA = ddist_A(A, ScalePermstruct, Glu_persist, - grid, &ainf_colptr, &ainf_rowind, &ainf_val, - &asup_rowptr, &asup_colind, &asup_val, - ilsum, ilsum_j)) > 0) - return (memDist + memA + memNLU); - memDist += (-memA); - - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - if ( !(ToRecv = intCalloc_dist(nsupers)) ) { - fprintf(stderr, "Calloc fails for ToRecv[]."); - return (memDist + memNLU); - } - memNLU += nsupers * iword; - - k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ToSendR[]."); - return (memDist + memNLU); - } - memNLU += k*sizeof(int_t*); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memDist + memNLU); - } - memNLU += j*iword; - - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - - /* Auxiliary arrays used to set up L and U block data structures. - They are freed on return. */ - if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_length[]."); - return (memDist + memNLU); - } - if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Malloc fails for LUb_indptr[]."); - return (memDist + memNLU); - } - if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_number[]."); - return (memDist + memNLU); - } - if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_valptr[]."); - return (memDist + memNLU); - } - memDist += 4 * nsupers_ij * iword; - - k = CEILING( nsupers, grid->nprow ); - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); - return (memDist + memNLU); - } - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*); - Unzval_br_ptr[nsupers_i-1] = NULL; - Ufstnz_br_ptr[nsupers_i-1] = NULL; - - if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Malloc fails for ToSendD[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*iword; - if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - memDist += (nsupers_i + nsupers_j)*iword; - - /* Auxiliary arrays used to set up L, U block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) - * sp_ienv_dist(3))) ) { - fprintf(stderr, "Calloc fails for SPA dense[]."); - return (memDist + memNLU); - } - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for fmod[]."); - return (memDist + memNLU); - } - if ( !(bmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for bmod[]."); - return (memDist + memNLU); - } - /* ------------------------------------------------ */ - memNLU += 2*nsupers_i*iword + - SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*); - Lnzval_bc_ptr[nsupers_j-1] = NULL; - Lrowind_bc_ptr[nsupers_j-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[]."); - return (memDist + memNLU); - } - len = nsupers_j * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[]."); - return (memDist + memNLU); - } - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ - memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid); /* Local block number row wise */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - if ( myrow == jbrow ) { /* Block row jb in my process row */ - /* Scatter A into SPA. */ - for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { - for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { - if (i >= asup_rowptr[ilsum[nsupers_i]]) - printf ("ERR7\n"); - jcol = asup_colind[i]; - if (jcol >= n) - printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - gb = BlockNum( jcol ); - lb = LBj( gb, grid ); - if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); - jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); - if (jcol >= ldaspa_j) - printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - dense_col[jcol] = asup_val[i]; - } - dense_col += ldaspa_j; - } - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - /* Count number of blocks and length of each block. */ - nrbu = 0; - len = 0; /* Number of column subscripts I own. */ - len1 = 0; /* number of fstnz subscripts */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - if (i >= xusub[nsupers_i]) printf ("ERR10\n"); - jcol = usub[i]; - gb = BlockNum( jcol ); /* Global block number */ - - /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) - printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", - iam, jb, gb, jcol, jbcol, pc); */ - - lb = LBj( gb, grid ); /* Local block number */ - pc = PCOL( gb, grid ); /* Process col owning this block */ - if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; - /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ - pr = PROW( gb, grid ); - if ( pr != jbrow && mycol == pc) - bsendx_plist[lb][jbrow] = YES; - if (mycol == pc) { - len += nsupc; - LUb_length[lb] += nsupc; - ToSendD[ljb_i] = YES; - if (Urb_marker[lb] <= jb) { /* First see this block */ - if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; - Urb_marker[lb] = jb + 1; - LUb_number[nrbu] = gb; - /* if (gb == 391825 && jb == 145361) - printf ("Pe[%d] T1 [%d %d] nrbu %d \n", - iam, jb, gb, nrbu); */ - nrbu ++; - len1 += SuperSize( gb ); - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[ljb_i];/* Mod. count for back solve */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - } - } /* for i ... */ - - if ( nrbu ) { - /* Sort the blocks of U in increasing block column index. - SuperLU_DIST assumes this is true */ - /* simple insert sort algorithm */ - /* to be transformed in quick sort */ - for (j = 1; j < nrbu; j++) { - k = LUb_number[j]; - for (i=j-1; i>=0 && LUb_number[i] > k; i--) { - LUb_number[i+1] = LUb_number[i]; - } - LUb_number[i+1] = k; - } - - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) { - fprintf (stderr, "Malloc fails for Uindex[]"); - return (memDist + memNLU); - } - Ufstnz_br_ptr[ljb_i] = index; - if (!(Unzval_br_ptr[ljb_i] = - doubleMalloc_dist(len))) { - fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); - return (memDist + memNLU); - } - memNLU += (len1+1)*iword + len*dword; - uval = Unzval_br_ptr[ljb_i]; - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = nrbu; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index */ - index[len1] = -1; /* End marker */ - next_ind = BR_HEADER; - next_val = 0; - for (k = 0; k < nrbu; k++) { - gb = LUb_number[k]; - lb = LBj( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; /* Reset vector of block length */ - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) - index[next_ind] = FstBlockC( jb + 1 ); - LUb_valptr[lb] = next_val; - next_val += len; - } - /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], - and the initial values of A from SPA into Unzval_br_ptr[]. */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - jcol = usub[i]; - gb = BlockNum( jcol ); - - if ( mycol == PCOL( gb, grid ) ) { - lb = LBj( gb, grid ); - k = LUb_indptr[lb]; /* Start fstnz in index */ - index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); - } - } /* for i ... */ - - for (i = 0; i < nrbu; i++) { - gb = LUb_number[i]; - lb = LBj( gb, grid ); - next_ind = LUb_indptr[lb]; - k = FstBlockC( jb + 1); - jcol = ilsum_j[lb]; - for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { - dense_col = dense; - j = index[next_ind+jj]; - for (ii = j; ii < k; ii++) { - uval[LUb_valptr[lb]++] = dense_col[jcol]; - dense_col[jcol] = zero; - dense_col += ldaspa_j; - } - } - } - } else { - Ufstnz_br_ptr[ljb_i] = NULL; - Unzval_br_ptr[ljb_i] = NULL; - } /* if nrbu ... */ - } /* if myrow == jbrow */ - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - if (mycol == jbcol) { /* Block column jb in my process column */ - /* Scatter A_inf into SPA. */ - for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { - for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { - irow = ainf_rowind[i]; - if (irow >= n) printf ("Pe[%d] ERR1\n", iam); - gb = BlockNum( irow ); - if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); - dense_col[irow] = ainf_val[i]; - } - } - dense_col += ldaspa; - } - - /* sort the indices of the diagonal block at the beginning of xlsub */ - if (myrow == jbrow) { - k = xlsub[ljb_j]; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - if (irow < nsupc + fsupc && i != k+irow-fsupc) { - lsub[i] = lsub[k + irow - fsupc]; - lsub[k + irow - fsupc] = irow; - i --; - } - } - } - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && - myrow == jbrow) { - fsendx_plist[ljb_j][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (Lrb_marker[lb] <= jb) { /* First see this block */ - Lrb_marker[lb] = jb + 1; - LUb_length[lb] = 1; - LUb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else - ++LUb_length[lb]; - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* If I am the owner of the diagonal block, order it first in LUb_number. - Necessary for SuperLU_DIST routines */ - kseen = EMPTY; - for (j = 0; j < nrbl; j++) { - if (LUb_number[j] == jb) - kseen = j; - } - if (kseen != EMPTY && kseen != 0) { - LUb_number[kseen] = LUb_number[0]; - LUb_number[0] = jb; - } - - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) { - fprintf (stderr, "Malloc fails for index[]"); - return (memDist + memNLU); - } - Lrowind_bc_ptr[ljb_j] = index; - if (!(Lnzval_bc_ptr[ljb_j] = - doubleMalloc_dist(len*nsupc))) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb); - return (memDist + memNLU); - } - memNLU += len1*iword + len*nsupc*dword; - - lusup = Lnzval_bc_ptr[ljb_j]; - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_ind = BC_HEADER; - next_val = 0; - for (k = 0; k < nrbl; ++k) { - gb = LUb_number[k]; - lb = LBi( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - LUb_valptr[lb] = next_val; - next_ind += len; - next_val += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = LUb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = LUb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb_j] = NULL; - Lnzval_bc_ptr[ljb_j] = NULL; - } /* if nrbl ... */ - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(ilsum_j); - SUPERLU_FREE(Urb_marker); - SUPERLU_FREE(LUb_length); - SUPERLU_FREE(LUb_indptr); - SUPERLU_FREE(LUb_number); - SUPERLU_FREE(LUb_valptr); - SUPERLU_FREE(Lrb_marker); - SUPERLU_FREE(dense); - - /* Free the memory used for storing L and U */ - SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); - if (lsub != NULL) - SUPERLU_FREE(lsub); - if (usub != NULL) - SUPERLU_FREE(usub); - - /* Free the memory used for storing A */ - SUPERLU_FREE(ainf_colptr); - if (ainf_rowind != NULL) { - SUPERLU_FREE(ainf_rowind); - SUPERLU_FREE(ainf_val); - } - SUPERLU_FREE(asup_rowptr); - if (asup_colind != NULL) { - SUPERLU_FREE(asup_colind); - SUPERLU_FREE(asup_val); - } - - /* exchange information about bsendx_plist in between column of processors */ - k = SUPERLU_MAX( grid->nprow, grid->npcol); - if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { - fprintf (stderr, "Malloc fails for recvBuf[]."); - return (memDist + memNLU); - } - if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - - if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) - memDist = nsupers*k*iword +4*nprocs * sizeof(int); - - for (p = 0; p < nprocs; p++) - nnzToRecv[p] = 0; - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - nnzToRecv[p] += grid->npcol; - } - i = 0; - for (p = 0; p < nprocs; p++) { - ptrToRecv[p] = i; - i += nnzToRecv[p]; - ptrToSend[p] = 0; - if (p != iam) - nnzToSend[p] = nnzToRecv[iam]; - else - nnzToSend[p] = 0; - } - nnzToRecv[iam] = 0; - i = ptrToRecv[iam]; - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - if (p == iam) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - for (j = 0; j < grid->npcol; j++, i++) - recvBuf[i] = ToSendR[ljb_j][j]; - } - } - - MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, - recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid ); /* Local block number row wise */ - /* (myrow == jbrow) { - if (ToSendD[ljb_i] == YES) - ToRecv[jb] = 1; - } - else { - if (recvBuf[ptrToRecv[p] + mycol] == YES) - ToRecv[jb] = 2; - } */ - if (recvBuf[ptrToRecv[p] + mycol] == YES) { - if (myrow == jbrow) - ToRecv[jb] = 1; - else - ToRecv[jb] = 2; - } - if (mycol == jbcol) { - for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) - ToSendR[ljb_j][i] = recvBuf[j]; - ToSendR[ljb_j][mycol] = EMPTY; - } - ptrToRecv[p] += grid->npcol; - } - - /* exchange information about bsendx_plist in between column of processors */ - MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, - MPI_MAX, grid->cscp.comm); - - for (jb = 0; jb < nsupers; jb ++) { - jbcol = PCOL( jb, grid); - jbrow = PROW( jb, grid); - if (mycol == jbcol) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - if (myrow == jbrow ) { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { - (*bsendx_plist)[k] = recvBuf[k]; - if ((*bsendx_plist)[k] != EMPTY) - nbsendx ++; - } - } - else { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) - (*bsendx_plist)[k] = EMPTY; - } - } - } - - SUPERLU_FREE(nnzToRecv); - SUPERLU_FREE(ptrToRecv); - SUPERLU_FREE(nnzToSend); - SUPERLU_FREE(ptrToSend); - SUPERLU_FREE(recvBuf); - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist, - ToRecv, ToSendR, ToSendD - */ - CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); -#endif - - return (- (memDist+memNLU)); -} /* dist_psymbtonum */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,1940 +0,0 @@ - - -/* - * -- Parallel symbolic factorization auxialiary routine (version 2.1) -- - * -- Distributes the data from parallel symbolic factorization - * -- to numeric factorization - * INRIA France - July 1, 2004 - * Laura Grigori - * - * November 1, 2007 - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -#include "superlu_ddefs.h" - -static float -dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, - Glu_persist_t *Glu_persist, - int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, - gridinfo_t *grid - ) -/* - * Purpose - * ======= - * - * Redistribute the symbolic structure of L and U from the distribution - * used in the parallel symbolic factorization step to the distdibution - * used in the parallel numeric factorization step. On exit, the L and U - * structure for the 2D distribution used in the numeric factorization step is - * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal - * information is also computed and it is stored in Glu_persist->supno - * and Glu_persist->xsup. - * - * This routine allocates memory for storing the structure of L and U - * and the supernodes information. This represents the arrays: - * p_xlsub, p_lsub, p_xusub, p_usub, - * Glu_persist->supno, Glu_persist->xsup. - * - * This routine also deallocates memory allocated during symbolic - * factorization routine. That is, the folloing arrays are freed: - * Pslu_freeable->xlsub, Pslu_freeable->lsub, - * Pslu_freeable->xusub, Pslu_freeable->usub, - * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, - * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. - * - * Arguments - * ========= - * - * n (Input) int_t - * Order of the input matrix - * Pslu_freeable (Input) Pslu_freeable_t * - * Local L and U structure, - * global to local indexing information. - * - * Glu_persist (Output) Glu_persist_t * - * Stores on output the information on supernodes mapping. - * - * p_xlsub (Output) int_t ** - * Pointer to structure of L distributed on a 2D grid - * of processors, stored by columns. - * - * p_lsub (Output) int_t ** - * Structure of L distributed on a 2D grid of processors, - * stored by columns. - * - * p_xusub (Output) int_t ** - * Pointer to structure of U distributed on a 2D grid - * of processors, stored by rows. - * - * p_usub (Output) int_t ** - * Structure of U distributed on a 2D grid of processors, - * stored by rows. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU. - * > 0, number of bytes allocated in this routine when out of memory. - * (an approximation). - */ -{ - int iam, nprocs, pc, pr, p, np, p_diag; - int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, - *tmp_ptrToSend, *mem; - int_t *nnzToRecv_l, *nnzToRecv_u; - int_t *send_1, *send_2, nsend_1, nsend_2; - int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; - int_t nsupers, nsupers_i, nsupers_j; - int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; - int_t maxszsn, maxNvtcsPProc; - int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; - int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; - int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; - int_t *xsub_s, *sub_s, *xsub_n, *sub_n; - int_t *globToLoc, nvtcs_loc; - int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, - RecvCnt_l, RecvCnt_u, ind_loc; - int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; - int_t nelts, isize; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_symbLU()"); -#endif - nprocs = (int) grid->nprow * grid->npcol; - xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; - xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = Pslu_freeable->nvtcs_loc; - xsup_beg_s = Pslu_freeable->xsup_beg_loc; - xsup_end_s = Pslu_freeable->xsup_end_loc; - supno_s = Pslu_freeable->supno_loc; - rcv_luind = NULL; - iword = sizeof(int_t); - dword = sizeof(double); - memAux = 0.; memRet = 0.; - - mem = intCalloc_dist(12 * nprocs); - if (!mem) - return (ERROR_RET); - memAux = (float) (12 * nprocs * sizeof(int_t)); - nnzToRecv = mem; - nnzToSend = nnzToRecv + 2*nprocs; - nnzToSend_l = nnzToSend + 2 * nprocs; - nnzToSend_u = nnzToSend_l + nprocs; - send_1 = nnzToSend_u + nprocs; - send_2 = send_1 + nprocs; - tmp_ptrToSend = send_2 + nprocs; - nnzToRecv_l = tmp_ptrToSend + nprocs; - nnzToRecv_u = nnzToRecv_l + nprocs; - - ptrToSend = nnzToSend; - ptrToRecv = nnzToSend + nprocs; - - nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); - intBuf1 = nvtcs + nprocs; - intBuf2 = nvtcs + 2 * nprocs; - intBuf3 = nvtcs + 3 * nprocs; - intBuf4 = nvtcs + 4 * nprocs; - memAux += 5 * nprocs * sizeof(int); - - maxszsn = sp_ienv_dist(3); - - /* Allocate space for storing Glu_persist_n. */ - if ( !(supno_n = intMalloc_dist(n+1)) ) { - fprintf (stderr, "Malloc fails for supno_n[]."); - return (memAux); - } - memRet += (float) ((n+1) * sizeof(int_t)); - - /* ------------------------------------------------------------ - DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION - ------------------------------------------------------------*/ - - if (nvtcs_loc > INT_MAX) - ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); - intNvtcs_loc = (int) nvtcs_loc; - MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, - 0, grid->comm); - - if (!iam) { - /* set ptrToRecv to point to the beginning of the data for - each processor */ - for (k = 0, p = 0; p < nprocs; p++) { - ptrToRecv[p] = k; - k += nvtcs[p]; - } - } - - if (nprocs > 1) { - temp = NULL; - if (!iam ) { - if ( !(temp = intMalloc_dist (n+1)) ) { - fprintf (stderr, "Malloc fails for temp[]."); - return (memAux + memRet); - } - memAux += (float) (n+1) * iword; - } -#if defined (_LONGINT) - for (p=0; p INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = ptrToRecv; -#endif - MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, - temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); - } - else - temp = supno_s; - - if (!iam) { - nsupers = 0; - p = (int) OWNER( globToLoc[0] ); - gb = temp[ptrToRecv[p]]; - supno_n[0] = nsupers; - ptrToRecv[p] ++; - szsn = 1; - for (j = 1; j < n; j ++) { - if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { - nsupers ++; - p = (int) OWNER( globToLoc[j] ); - gb = temp[ptrToRecv[p]]; - szsn = 1; - } - else { - szsn ++; - } - ptrToRecv[p] ++; - supno_n[j] = nsupers; - } - nsupers++; - if (nprocs > 1) { - SUPERLU_FREE (temp); - memAux -= (float) (n+1) * iword; - } - supno_n[n] = nsupers; - } - - /* reset to 0 nnzToSend */ - for (p = 0; p < 2 *nprocs; p++) - nnzToSend[p] = 0; - - MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); - nsupers = supno_n[n]; - /* Allocate space for storing Glu_persist_n. */ - if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { - fprintf (stderr, "Malloc fails for xsup_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers+1) * iword; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF L and U. - ------------------------------------------------------------*/ - gb = EMPTY; - for (i = 0; i < n; i++) { - if (gb != supno_n[i]) { - /* a new supernode starts */ - gb = supno_n[i]; - xsup_n[gb] = i; - } - } - xsup_n[nsupers] = n; - - for (p = 0; p < nprocs; p++) { - send_1[p] = FALSE; - send_2[p] = FALSE; - } - for (gb_n = 0; gb_n < nsupers; gb_n ++) { - i = xsup_n[gb_n]; - if (iam == (int) OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { - k = lsub_s[j]; - if (k >= i) { - gb = supno_n[k]; - p = (int) PNUM( PROW(gb, grid), pc, grid ); - nnzToSend[2*p] ++; - send_1[p] = TRUE; - } - } - for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { - k = usub_s[j]; - if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { - gb = supno_n[k]; - p = PNUM( pr, PCOL(gb, grid), grid); - nnzToSend[2*p+1] ++; - send_2[p] = TRUE; - } - } - - nsend_2 = 0; - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - nnzToSend[2*p+1] += 2; - if (send_2[p]) nsend_2 ++; - } - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) - if (send_2[p] || p == p_diag) { - if (p == p_diag && !send_2[p]) - nnzToSend[2*p+1] += nsend_2; - else - nnzToSend[2*p+1] += nsend_2-1; - send_2[p] = FALSE; - } - nsend_1 = 0; - for (p = pc; p < nprocs; p += grid->npcol) { - nnzToSend[2*p] += 2; - if (send_1[p]) nsend_1 ++; - } - for (p = pc; p < nprocs; p += grid->npcol) - if (send_1[p]) { - nnzToSend[2*p] += nsend_1-1; - send_1[p] = FALSE; - } - else - nnzToSend[2*p] += nsend_1; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, - grid->comm); - - nnz_loc_l = nnz_loc_u = 0; - SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; - for (p = 0; p < nprocs; p++) { - if ( p != iam ) { - SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; - SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; - RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; - RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } else { - nnz_loc_l += nnzToRecv[2*p]; - nnz_loc_u += nnzToRecv[2*p+1]; - nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; - nnzToRecv_l[p] = nnzToRecv[2*p]; - nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } - } - - /* Allocate space for storing the symbolic structure after redistribution. */ - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for xlsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_j+1) * iword; - - if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for xusub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_i+1) * iword; - - /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ - if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { - if (!(rcv_luind = - intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { - fprintf (stderr, "Malloc fails for rcv_luind[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) - * iword; - } - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { - fprintf (stderr, "Malloc fails for index[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------------ - LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF L and U. - ------------------------------------------------------------------*/ - sendL = TRUE; - sendU = FALSE; - while (sendL || sendU) { - if (sendL) { - xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; - nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; - } - if (sendU) { - xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; - nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; - } - for (i = 0, j = 0, p = 0; p < nprocs; p++) { - if ( p != iam ) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - ptrToRecv[p] = j; j += nnzToRecv[p]; - } - nnzToRecv[iam] = 0; - - ind_loc = ptrToRecv[iam]; - for (gb_n = 0; gb_n < nsupers; gb_n++) { - nsend_2 = 0; - i = xsup_n[gb_n]; - if (iam == OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid ); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - - if (sendL) { - p = pc; np = grid->nprow; - } else { - p = pr * grid->npcol; np = grid->npcol; - } - for (j = 0; j < np; j++) { - if (p == iam) { - rcv_luind[ind_loc] = gb_n; - rcv_luind[ind_loc+1] = 0; - tmp_ptrToSend[p] = ind_loc + 1; - ind_loc += 2; - } - else { - snd_luind[ptrToSend[p]] = gb_n; - snd_luind[ptrToSend[p]+1] = 0; - tmp_ptrToSend[p] = ptrToSend[p] + 1; - ptrToSend[p] += 2; - } - if (sendL) p += grid->npcol; - if (sendU) p++; - } - for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { - k = sub_s[j]; - if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { - gb = supno_n[k]; - if (sendL) - p = PNUM( PROW(gb, grid), pc, grid ); - else - p = PNUM( pr, PCOL(gb, grid), grid); - if (send_1[p] == FALSE) { - send_1[p] = TRUE; - send_2[nsend_2] = k; nsend_2 ++; - } - if (p == iam) { - rcv_luind[ind_loc] = k; ind_loc++; - if (sendL) - xsub_n[LBj( gb_n, grid )] ++; - else - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = k; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - if (sendL) - for (p = pc; p < nprocs; p += grid->npcol) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if (PNUM(PROW(gb, grid), pc, grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBj( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - if (sendU) - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - if (send_1[p] || p == p_diag) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if(PNUM( pr, PCOL(gb, grid), grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - } - } - } - - /* reset ptrToSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv) */ - for (i = 0, p = 0; p < nprocs; p++) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs > 1) { -#if defined (_LONGINT) - for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptrToSend[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptrToSend; - intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; -#endif - - MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, - rcv_luind, intBuf3, intBuf4, mpi_int_t, - grid->comm); - } - if (sendL) - nnzToRecv[iam] = nnz_loc_l; - else - nnzToRecv[iam] = nnz_loc_u; - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE. - -------------------------------------------------------------*/ - if (sendU) - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - SUPERLU_FREE (snd_luind); - memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------ - CONVERT THE FORMAT. - ------------------------------------------------------------*/ - /* Initialize the array of column of L/ row of U pointers */ - k = 0; - for (p = 0; p < nprocs; p ++) { - if (p != iam) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - nelts = rcv_luind[i+1]; - if (sendL) - xsub_n[LBj( gb, grid )] = nelts; - else - xsub_n[LBi( gb, grid )] = nelts; - i += nelts + 2; - } - } - k += nnzToRecv[p]; - } - - if (sendL) j = nsupers_j; - else j = nsupers_i; - k = 0; - isize = xsub_n[0]; - xsub_n[0] = 0; - for (gb_l = 1; gb_l < j; gb_l++) { - k += isize; - isize = xsub_n[gb_l]; - xsub_n[gb_l] = k; - } - xsub_n[gb_l] = k + isize; - nnz_loc = xsub_n[gb_l]; - if (sendL) { - lsub_n = NULL; - if (nnz_loc) { - if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for lsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = lsub_n; - } - if (sendU) { - usub_n = NULL; - if (nnz_loc) { - if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for usub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = usub_n; - } - - /* Copy the data into the L column / U row oriented storage */ - k = 0; - for (p = 0; p < nprocs; p++) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - if (gb >= nsupers) - printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", - iam, p, gb, nsupers, i, i-k); - i += 2; - if (sendL) gb_l = LBj( gb, grid ); - if (sendU) gb_l = LBi( gb, grid ); - for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { - sub_n[j] = rcv_luind[i]; - } - } - k += nnzToRecv[p]; - } - if (sendL) { - sendL = FALSE; sendU = TRUE; - } - else - sendU = FALSE; - } - - /* deallocate memory allocated during symbolic factorization routine */ - if (rcv_luind != NULL) { - SUPERLU_FREE (rcv_luind); - memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; - } - SUPERLU_FREE (mem); - memAux -= (float) (12 * nprocs * iword); - SUPERLU_FREE(nvtcs); - memAux -= (float) (5 * nprocs * sizeof(int)); - - if (xlsub_s != NULL) { - SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); - } - if (xusub_s != NULL) { - SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); - } - SUPERLU_FREE (globToLoc); - if (supno_s != NULL) { - SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); - SUPERLU_FREE (supno_s); - } - - Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; - *p_xlsub = xlsub_n; *p_lsub = lsub_n; - *p_xusub = xusub_n; *p_usub = usub_n; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dist_symbLU()"); -#endif - - return (-memRet); -} - -static float -ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val, - int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val, - int_t *ilsum_i, int_t *ilsum_j - ) -{ -/* - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. The lower part is - * stored using a column format and the upper part - * is stored using a row format. - * - * Arguments - * ========= - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_persist (Input) Glu_persist_t * - * Information on supernodes mapping. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * p_ainf_colptr (Output) int_t** - * Pointer to the lower part of A distributed on a 2D grid - * of processors, stored by columns. - * - * p_ainf_rowind (Output) int_t** - * Structure of of the lower part of A distributed on a - * 2D grid of processors, stored by columns. - * - * p_ainf_val (Output) double** - * Numerical values of the lower part of A, distributed on a - * 2D grid of processors, stored by columns. - * - * p_asup_rowptr (Output) int_t** - * Pointer to the upper part of A distributed on a 2D grid - * of processors, stored by rows. - * - * p_asup_colind (Output) int_t** - * Structure of of the upper part of A distributed on a - * 2D grid of processors, stored by rows. - * - * p_asup_val (Output) double** - * Numerical values of the upper part of A, distributed on a - * 2D grid of processors, stored by rows. - * - * ilsum_i (Input) int_t * - * Starting position of each supernode in - * the full array (local, block row wise). - * - * ilsum_j (Input) int_t * - * Starting position of each supernode in - * the full array (local, block column wise). - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated when out of memory. - * (an approximation). - * - */ - int iam, p, procs; - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; - int_t nsupers, nsupers_i, nsupers_j; - int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - double *asup_val, *ainf_val; - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; - double *nzval_a; - MPI_Request *send_req; - MPI_Status status; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword, szbuf; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter ddist_A()"); -#endif - iword = sizeof(int_t); - dword = sizeof(double); - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - if (!(nnzToRecv = intCalloc_dist(2*procs))) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (ERROR_RET); - } - memAux = (float) (2 * procs * iword); - memRet = 0.; - nnzToSend = nnzToRecv + procs; - nsupers = supno[n-1] + 1; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - szbuf = k; - - /* Allocate space for storing the triplets after redistribution. */ - if ( !(ia = intMalloc_dist(2*k)) ) { - fprintf (stderr, "Malloc fails for ia[]."); - return (memAux); - } - memAux += (float) (2*k*iword); - ja = ia + k; - if ( !(aij = doubleMalloc_dist(k)) ) { - fprintf (stderr, "Malloc fails for aij[]."); - return (memAux); - } - memAux += (float) (k*dword); - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { - fprintf (stderr, "Malloc fails for send_req[]."); - return (memAux); - } - memAux += (float) (2*procs *sizeof(MPI_Request)); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ia_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(int_t*)); - if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for aij_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(double*)); - if ( !(index = intMalloc_dist(2*SendCnt)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memAux); - } - memAux += (float) (2*SendCnt*iword); - if ( !(nzval = doubleMalloc_dist(SendCnt)) ) { - fprintf(stderr, "Malloc fails for nzval[]."); - return (memAux); - } - memAux += (float) (SendCnt * dword); - if ( !(ptr_to_send = intCalloc_dist(procs)) ) { - fprintf(stderr, "Malloc fails for ptr_to_send[]."); - return (memAux); - } - memAux += (float) (procs * iword); - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for itemp[]."); - return (memAux); - } - memAux += (float) (2*maxnnzToRecv*iword); - if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for dtemp[]."); - return (memAux); - } - memAux += (float) (maxnnzToRecv * dword); - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { - fprintf (stderr, "Malloc fails for *ainf_colptr[]."); - return (memAux); - } - memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; - if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { - fprintf (stderr, "Malloc fails for *asup_rowptr[]."); - return (memAux+memRet); - } - memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nnz_loc_ainf = nnz_loc_asup = 0; - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, MPI_DOUBLE, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - irow = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /* assert(jcol= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - MPI_Wait( &send_req[p], &status); - MPI_Wait( &send_req[procs+p], &status); - } - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - - SUPERLU_FREE(nnzToRecv); - memAux -= 2 * procs * iword; - if ( procs > 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - SUPERLU_FREE(ptr_to_send); - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + - procs*sizeof(double*) + 2*SendCnt * iword + - SendCnt* dword + procs*iword + - 2*maxnnzToRecv*iword + maxnnzToRecv*dword; - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT. - ------------------------------------------------------------*/ - if (nnz_loc_ainf != 0) { - if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_rowind[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * iword); - if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_val[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * dword); - } - else { - ainf_rowind = NULL; - ainf_val = NULL; - } - if (nnz_loc_asup != 0) { - if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_colind[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * iword); - if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_val[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * dword); - } - else { - asup_colind = NULL; - asup_val = NULL; - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = ainf_colptr[0]; ainf_colptr[0] = 0; - for (j = 1; j < ilsum_j[nsupers_j]; j++) { - k += jsize; - jsize = ainf_colptr[j]; - ainf_colptr[j] = k; - } - ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; - i = 0; - isize = asup_rowptr[0]; asup_rowptr[0] = 0; - for (j = 1; j < ilsum_i[nsupers_i]; j++) { - i += isize; - isize = asup_rowptr[j]; - asup_rowptr[j] = i; - } - asup_rowptr[ilsum_i[nsupers_i]] = i + isize; - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - jcol = ja[i]; - irow = ia[i]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); - k = ainf_colptr[j]; - ainf_rowind[k] = irow; - ainf_val[k] = aij[i]; - ainf_colptr[j] ++; - } - else { - j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); - k = asup_rowptr[j]; - asup_colind[k] = jcol; - asup_val[k] = aij[i]; - asup_rowptr[j] ++; - } - } - - /* Reset the column pointers to the beginning of each column */ - for (j = ilsum_j[nsupers_j]; j > 0; j--) - ainf_colptr[j] = ainf_colptr[j-1]; - for (j = ilsum_i[nsupers_i]; j > 0; j--) - asup_rowptr[j] = asup_rowptr[j-1]; - ainf_colptr[0] = 0; - asup_rowptr[0] = 0; - - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - memAux -= 2*szbuf*iword + szbuf*dword; - - *p_ainf_colptr = ainf_colptr; - *p_ainf_rowind = ainf_rowind; - *p_ainf_val = ainf_val; - *p_asup_rowptr = asup_rowptr; - *p_asup_colind = asup_colind; - *p_asup_val = asup_val; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit ddist_A()"); - fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); -#endif - - return (-memRet); -} /* dist_A */ - -int_t -ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Pslu_freeable_t *Pslu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * - * - * Purpose - * ======= - * Distribute the input matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * This routine should not be called for this case, an error - * is generated. Instead, pddistribute routine should be called. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (Input) int - * Dimension of the matrix. - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (Input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (Input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated for performing the distribution - * of the data, when out of memory. - * (an approximation). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t Glu_freeable_n; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, - len, len1, nsupc, nsupc_gb, ii, nprocs; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - double *a; - int_t *asub, *xa; - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - double *asup_val, *ainf_val; - int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; - int_t next_ind; /* next available position in index[*] */ - int_t next_val; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - double *lusup, *uval; /* nonzero values in L and U */ - int_t *recvBuf; - int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in - the full array (local, block column wise) */ - /*-- Auxiliary arrays; freed on return --*/ - int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *LUb_length; /* L,U block length; size nsupers_ij */ - int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ - int_t *LUb_number; /* global block number; size nsupers_ij */ - int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ - int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float memStrLU, memA, - memDist = 0.; /* memory used for redistributing the data, which does - not include the memory for the numerical values of L and U */ - float memNLU = 0.; /* memory allocated for storing the numerical values of - L and U, that will be used in the numeric factorization */ - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif - - /* Initialization. */ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); -#endif - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nprocs = grid->npcol * grid->nprow; - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - Astore = (NRformat_loc *) A->Store; - - iword = sizeof(int_t); - dword = sizeof(double); - - if (fact == SamePattern_SameRowPerm) { - ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); - } - - if ((memStrLU = - dist_symbLU (n, Pslu_freeable, - Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) - return (memStrLU); - memDist += (-memStrLU); - xsup = Glu_persist->xsup; /* supernode and column mapping */ - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ - nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ - nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); - if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for ilsum[]."); - return (memDist + memNLU); - } - memNLU += (nsupers_i+1) * iword; - if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for ilsum_j[]."); - return (memDist + memNLU); - } - memDist += (nsupers_j+1) * iword; - - /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */ - ilsum[0] = 0; - ldaspa = 0; - for (gb = 0; gb < nsupers; gb++) - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - ilsum[nsupers_i] = ldaspa; - - ldaspa_j = 0; ilsum_j[0] = 0; - for (gb = 0; gb < nsupers; gb++) - if (mycol == PCOL( gb, grid )) { - i = SuperSize( gb ); - ldaspa_j += i; - lb = LBj( gb, grid ); - ilsum_j[lb + 1] = ilsum_j[lb] + i; - } - ilsum_j[nsupers_j] = ldaspa_j; - - if ((memA = ddist_A(A, ScalePermstruct, Glu_persist, - grid, &ainf_colptr, &ainf_rowind, &ainf_val, - &asup_rowptr, &asup_colind, &asup_val, - ilsum, ilsum_j)) > 0) - return (memDist + memA + memNLU); - memDist += (-memA); - - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - if ( !(ToRecv = intCalloc_dist(nsupers)) ) { - fprintf(stderr, "Calloc fails for ToRecv[]."); - return (memDist + memNLU); - } - memNLU += nsupers * iword; - - k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ToSendR[]."); - return (memDist + memNLU); - } - memNLU += k*sizeof(int_t*); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memDist + memNLU); - } - memNLU += j*iword; - - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - - /* Auxiliary arrays used to set up L and U block data structures. - They are freed on return. */ - if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_length[]."); - return (memDist + memNLU); - } - if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Malloc fails for LUb_indptr[]."); - return (memDist + memNLU); - } - if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_number[]."); - return (memDist + memNLU); - } - if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_valptr[]."); - return (memDist + memNLU); - } - memDist += 4 * nsupers_ij * iword; - - k = CEILING( nsupers, grid->nprow ); - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); - return (memDist + memNLU); - } - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*); - Unzval_br_ptr[nsupers_i-1] = NULL; - Ufstnz_br_ptr[nsupers_i-1] = NULL; - - if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Malloc fails for ToSendD[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*iword; - if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - memDist += (nsupers_i + nsupers_j)*iword; - - /* Auxiliary arrays used to set up L, U block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) - * sp_ienv_dist(3))) ) { - fprintf(stderr, "Calloc fails for SPA dense[]."); - return (memDist + memNLU); - } - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for fmod[]."); - return (memDist + memNLU); - } - if ( !(bmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for bmod[]."); - return (memDist + memNLU); - } - /* ------------------------------------------------ */ - memNLU += 2*nsupers_i*iword + - SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*); - Lnzval_bc_ptr[nsupers_j-1] = NULL; - Lrowind_bc_ptr[nsupers_j-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[]."); - return (memDist + memNLU); - } - len = nsupers_j * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[]."); - return (memDist + memNLU); - } - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ - memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid); /* Local block number row wise */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - if ( myrow == jbrow ) { /* Block row jb in my process row */ - /* Scatter A into SPA. */ - for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { - for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { - if (i >= asup_rowptr[ilsum[nsupers_i]]) - printf ("ERR7\n"); - jcol = asup_colind[i]; - if (jcol >= n) - printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - gb = BlockNum( jcol ); - lb = LBj( gb, grid ); - if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); - jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); - if (jcol >= ldaspa_j) - printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - dense_col[jcol] = asup_val[i]; - } - dense_col += ldaspa_j; - } - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - /* Count number of blocks and length of each block. */ - nrbu = 0; - len = 0; /* Number of column subscripts I own. */ - len1 = 0; /* number of fstnz subscripts */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - if (i >= xusub[nsupers_i]) printf ("ERR10\n"); - jcol = usub[i]; - gb = BlockNum( jcol ); /* Global block number */ - - /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) - printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", - iam, jb, gb, jcol, jbcol, pc); */ - - lb = LBj( gb, grid ); /* Local block number */ - pc = PCOL( gb, grid ); /* Process col owning this block */ - if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; - /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ - pr = PROW( gb, grid ); - if ( pr != jbrow && mycol == pc) - bsendx_plist[lb][jbrow] = YES; - if (mycol == pc) { - len += nsupc; - LUb_length[lb] += nsupc; - ToSendD[ljb_i] = YES; - if (Urb_marker[lb] <= jb) { /* First see this block */ - if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; - Urb_marker[lb] = jb + 1; - LUb_number[nrbu] = gb; - /* if (gb == 391825 && jb == 145361) - printf ("Pe[%d] T1 [%d %d] nrbu %d \n", - iam, jb, gb, nrbu); */ - nrbu ++; - len1 += SuperSize( gb ); - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[ljb_i];/* Mod. count for back solve */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - } - } /* for i ... */ - - if ( nrbu ) { - /* Sort the blocks of U in increasing block column index. - SuperLU_DIST assumes this is true */ - /* simple insert sort algorithm */ - /* to be transformed in quick sort */ - for (j = 1; j < nrbu; j++) { - k = LUb_number[j]; - for (i=j-1; i>=0 && LUb_number[i] > k; i--) { - LUb_number[i+1] = LUb_number[i]; - } - LUb_number[i+1] = k; - } - - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) { - fprintf (stderr, "Malloc fails for Uindex[]"); - return (memDist + memNLU); - } - Ufstnz_br_ptr[ljb_i] = index; - if (!(Unzval_br_ptr[ljb_i] = - doubleMalloc_dist(len))) { - fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); - return (memDist + memNLU); - } - memNLU += (len1+1)*iword + len*dword; - uval = Unzval_br_ptr[ljb_i]; - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = nrbu; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index */ - index[len1] = -1; /* End marker */ - next_ind = BR_HEADER; - next_val = 0; - for (k = 0; k < nrbu; k++) { - gb = LUb_number[k]; - lb = LBj( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; /* Reset vector of block length */ - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) - index[next_ind] = FstBlockC( jb + 1 ); - LUb_valptr[lb] = next_val; - next_val += len; - } - /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], - and the initial values of A from SPA into Unzval_br_ptr[]. */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - jcol = usub[i]; - gb = BlockNum( jcol ); - - if ( mycol == PCOL( gb, grid ) ) { - lb = LBj( gb, grid ); - k = LUb_indptr[lb]; /* Start fstnz in index */ - index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); - } - } /* for i ... */ - - for (i = 0; i < nrbu; i++) { - gb = LUb_number[i]; - lb = LBj( gb, grid ); - next_ind = LUb_indptr[lb]; - k = FstBlockC( jb + 1); - jcol = ilsum_j[lb]; - for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { - dense_col = dense; - j = index[next_ind+jj]; - for (ii = j; ii < k; ii++) { - uval[LUb_valptr[lb]++] = dense_col[jcol]; - dense_col[jcol] = zero; - dense_col += ldaspa_j; - } - } - } - } else { - Ufstnz_br_ptr[ljb_i] = NULL; - Unzval_br_ptr[ljb_i] = NULL; - } /* if nrbu ... */ - } /* if myrow == jbrow */ - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - if (mycol == jbcol) { /* Block column jb in my process column */ - /* Scatter A_inf into SPA. */ - for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { - for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { - irow = ainf_rowind[i]; - if (irow >= n) printf ("Pe[%d] ERR1\n", iam); - gb = BlockNum( irow ); - if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); - dense_col[irow] = ainf_val[i]; - } - } - dense_col += ldaspa; - } - - /* sort the indices of the diagonal block at the beginning of xlsub */ - if (myrow == jbrow) { - k = xlsub[ljb_j]; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - if (irow < nsupc + fsupc && i != k+irow-fsupc) { - lsub[i] = lsub[k + irow - fsupc]; - lsub[k + irow - fsupc] = irow; - i --; - } - } - } - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && - myrow == jbrow) { - fsendx_plist[ljb_j][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (Lrb_marker[lb] <= jb) { /* First see this block */ - Lrb_marker[lb] = jb + 1; - LUb_length[lb] = 1; - LUb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else - ++LUb_length[lb]; - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* If I am the owner of the diagonal block, order it first in LUb_number. - Necessary for SuperLU_DIST routines */ - kseen = EMPTY; - for (j = 0; j < nrbl; j++) { - if (LUb_number[j] == jb) - kseen = j; - } - if (kseen != EMPTY && kseen != 0) { - LUb_number[kseen] = LUb_number[0]; - LUb_number[0] = jb; - } - - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) { - fprintf (stderr, "Malloc fails for index[]"); - return (memDist + memNLU); - } - Lrowind_bc_ptr[ljb_j] = index; - if (!(Lnzval_bc_ptr[ljb_j] = - doubleMalloc_dist(len*nsupc))) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb); - return (memDist + memNLU); - } - memNLU += len1*iword + len*nsupc*dword; - - lusup = Lnzval_bc_ptr[ljb_j]; - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_ind = BC_HEADER; - next_val = 0; - for (k = 0; k < nrbl; ++k) { - gb = LUb_number[k]; - lb = LBi( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - LUb_valptr[lb] = next_val; - next_ind += len; - next_val += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = LUb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = LUb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb_j] = NULL; - Lnzval_bc_ptr[ljb_j] = NULL; - } /* if nrbl ... */ - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(ilsum_j); - SUPERLU_FREE(Urb_marker); - SUPERLU_FREE(LUb_length); - SUPERLU_FREE(LUb_indptr); - SUPERLU_FREE(LUb_number); - SUPERLU_FREE(LUb_valptr); - SUPERLU_FREE(Lrb_marker); - SUPERLU_FREE(dense); - - /* Free the memory used for storing L and U */ - SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); - if (lsub != NULL) - SUPERLU_FREE(lsub); - if (usub != NULL) - SUPERLU_FREE(usub); - - /* Free the memory used for storing A */ - SUPERLU_FREE(ainf_colptr); - if (ainf_rowind != NULL) { - SUPERLU_FREE(ainf_rowind); - SUPERLU_FREE(ainf_val); - } - SUPERLU_FREE(asup_rowptr); - if (asup_colind != NULL) { - SUPERLU_FREE(asup_colind); - SUPERLU_FREE(asup_val); - } - - /* exchange information about bsendx_plist in between column of processors */ - k = SUPERLU_MAX( grid->nprow, grid->npcol); - if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { - fprintf (stderr, "Malloc fails for recvBuf[]."); - return (memDist + memNLU); - } - if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - - if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) - memDist = nsupers*k*iword +4*nprocs * sizeof(int); - - for (p = 0; p < nprocs; p++) - nnzToRecv[p] = 0; - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - nnzToRecv[p] += grid->npcol; - } - i = 0; - for (p = 0; p < nprocs; p++) { - ptrToRecv[p] = i; - i += nnzToRecv[p]; - ptrToSend[p] = 0; - if (p != iam) - nnzToSend[p] = nnzToRecv[iam]; - else - nnzToSend[p] = 0; - } - nnzToRecv[iam] = 0; - i = ptrToRecv[iam]; - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - if (p == iam) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - for (j = 0; j < grid->npcol; j++, i++) - recvBuf[i] = ToSendR[ljb_j][j]; - } - } - - MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, - recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid ); /* Local block number row wise */ - /* (myrow == jbrow) { - if (ToSendD[ljb_i] == YES) - ToRecv[jb] = 1; - } - else { - if (recvBuf[ptrToRecv[p] + mycol] == YES) - ToRecv[jb] = 2; - } */ - if (recvBuf[ptrToRecv[p] + mycol] == YES) { - if (myrow == jbrow) - ToRecv[jb] = 1; - else - ToRecv[jb] = 2; - } - if (mycol == jbcol) { - for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) - ToSendR[ljb_j][i] = recvBuf[j]; - ToSendR[ljb_j][mycol] = EMPTY; - } - ptrToRecv[p] += grid->npcol; - } - - /* exchange information about bsendx_plist in between column of processors */ - MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, - MPI_MAX, grid->cscp.comm); - - for (jb = 0; jb < nsupers; jb ++) { - jbcol = PCOL( jb, grid); - jbrow = PROW( jb, grid); - if (mycol == jbcol) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - if (myrow == jbrow ) { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { - (*bsendx_plist)[k] = recvBuf[k]; - if ((*bsendx_plist)[k] != EMPTY) - nbsendx ++; - } - } - else { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) - (*bsendx_plist)[k] = EMPTY; - } - } - } - - SUPERLU_FREE(nnzToRecv); - SUPERLU_FREE(ptrToRecv); - SUPERLU_FREE(nnzToSend); - SUPERLU_FREE(ptrToSend); - SUPERLU_FREE(recvBuf); - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist, - ToRecv, ToSendR, ToSendD - */ - CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); -#endif - - return (- (memDist+memNLU)); -} /* dist_psymbtonum */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdutil.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,529 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_ddefs.h" - -/* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - */ -int pdCompRow_loc_to_CompCol_global -( - int_t need_value, /* Input. Whether need to gather numerical values */ - SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ - gridinfo_t *grid, /* Input */ - SuperMatrix *GA /* Output */ -) -{ - NRformat_loc *Astore; - NCformat *GAstore; - double *a, *a_loc; - int_t *colind, *rowptr; - int_t *colptr_loc, *rowind_loc; - int_t m_loc, n, i, j, k, l; - int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz; - double *a_recv; /* Buffer to receive the blocks of values. */ - double *a_buf; /* Buffer to merge blocks into block columns. */ - int_t *colcnt, *itemp; - int_t *colptr_send; /* Buffer to redistribute the column pointers of the - local block rows. - Use n_loc+1 pointers for each block. */ - int_t *colptr_blk; /* The column pointers for each block, after - redistribution to the local block columns. - Use n_loc+1 pointers for each block. */ - int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ - int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ - int_t *fst_rows, *n_locs; - int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; - int it, n_loc, procs; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdCompRow_loc_to_CompCol_global"); -#endif - - /* Initialization. */ - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ - - /* ------------------------------------------------------------ - FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. - ------------------------------------------------------------*/ - dCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, - &rowind_loc, &colptr_loc); - /* Change local row index numbers to global numbers. */ - for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; - -#if ( DEBUGlevel>=2 ) - printf("Proc %d\n", grid->iam); - PrintInt10("rowind_loc", nnz_loc, rowind_loc); - PrintInt10("colptr_loc", n+1, colptr_loc); -#endif - - procs = grid->nprow * grid->npcol; - if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) - ABORT("Malloc fails for fst_rows[]"); - n_locs = fst_rows + procs; - MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, - grid->comm); - for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; - n_locs[procs-1] = n - fst_rows[procs-1]; - if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) - ABORT("Malloc fails for recvcnts[]"); - sendcnts = recvcnts + procs; - rdispls = sendcnts + procs; - sdispls = rdispls + procs; - itemp_32 = sdispls + procs; - - /* All-to-all transfer column pointers of each block. - Now the matrix view is P-by-P block-partition. */ - /* n column starts for each column, and procs column ends for each block */ - if ( !(colptr_send = intMalloc_dist(n + procs)) ) - ABORT("Malloc fails for colptr_send[]"); - if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) - ABORT("Malloc fails for colptr_blk[]"); - for (i = 0, j = 0; i < procs; ++i) { - for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; - colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ - sendcnts[i] = n_locs[i] + 1; -#if ( DEBUGlevel>=1 ) - assert(j == fst_rows[i]); -#endif - sdispls[i] = j + i; - recvcnts[i] = n_loc + 1; - rdispls[i] = i * (n_loc + 1); - j += n_locs[i]; /* First column of next block in colptr_loc[] */ - } - MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, - colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); - - /* Adjust colptr_blk[] so that they contain the local indices of the - column pointers in the receive buffer. */ - nnz = 0; /* The running sum of the nonzeros counted by far */ - k = 0; - for (i = 0; i < procs; ++i) { - for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { - colnnz = colptr_blk[j+1] - colptr_blk[j]; - /*assert(k<=j);*/ - colptr_blk[k] = nnz; - nnz += colnnz; /* Start of the next column */ - ++k; - } - colptr_blk[k++] = nnz; /* Add an END marker for each block */ - } - /*assert(k == (n_loc+1)*procs);*/ - - /* Now prepare to transfer row indices and values. */ - sdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; - sdispls[i+1] = sdispls[i] + sendcnts[i]; - } - sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; - for (i = 0; i < procs; ++i) { - j = rdispls[i]; /* Point to this block in colptr_blk[]. */ - recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; - } - rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ - for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; - - k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ - if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) - ABORT("Malloc fails for rowind_recv[]"); - rowind_buf = rowind_recv + k; - MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, - rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) ) - ABORT("Malloc fails for rowind_recv[]"); - a_buf = a_recv + k; - MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, - a_recv, recvcnts, rdispls, MPI_DOUBLE, - grid->comm); - } - - /* Reset colptr_loc[] to point to the n_loc global columns. */ - colptr_loc[0] = 0; - itemp = colptr_send; - for (j = 0; j < n_loc; ++j) { - colnnz = 0; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1) + j; /* j-th column in i-th block */ - colnnz += colptr_blk[k+1] - colptr_blk[k]; - } - colptr_loc[j+1] = colptr_loc[j] + colnnz; - itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ - } - itemp[n_loc] = colptr_loc[n_loc]; - - /* Merge blocks of row indices into columns of row indices. */ - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - rowind_buf[itemp[j]] = rowind_recv[l]; - ++itemp[j]; - } - } - } - - if ( need_value ) { - for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - a_buf[itemp[j]] = a_recv[l]; - ++itemp[j]; - } - } - } - } - - /* ------------------------------------------------------------ - SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. - ------------------------------------------------------------*/ - GA->nrow = A->nrow; - GA->ncol = A->ncol; - GA->Stype = SLU_NC; - GA->Dtype = A->Dtype; - GA->Mtype = A->Mtype; - GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); - if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); - - /* First gather the size of each piece. */ - nnz_loc = colptr_loc[n_loc]; - MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); - for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; - GAstore->nnz = nnz; - - if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); - if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); - - /* Allgatherv for row indices. */ - rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + itemp[i]; - itemp_32[i] = itemp[i]; - } - itemp_32[procs-1] = itemp[procs-1]; - it = nnz_loc; - MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, - itemp_32, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); - MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, - itemp_32, rdispls, MPI_DOUBLE, grid->comm); - } else GAstore->nzval = NULL; - - /* Now gather the column pointers. */ - rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + n_locs[i]; - itemp_32[i] = n_locs[i]; - } - itemp_32[procs-1] = n_locs[procs-1]; - MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, - itemp_32, rdispls, mpi_int_t, grid->comm); - - /* Recompute column pointers. */ - for (i = 1; i < procs; ++i) { - k = rdispls[i]; - for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; - itemp[i] += itemp[i-1]; /* prefix sum */ - } - GAstore->colptr[n] = nnz; - -#if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { - printf("After pdCompRow_loc_to_CompCol_global()\n"); - dPrint_CompCol_Matrix_dist(GA); - } -#endif - - SUPERLU_FREE(a_loc); - SUPERLU_FREE(rowind_loc); - SUPERLU_FREE(colptr_loc); - SUPERLU_FREE(fst_rows); - SUPERLU_FREE(recvcnts); - SUPERLU_FREE(colptr_send); - SUPERLU_FREE(colptr_blk); - SUPERLU_FREE(rowind_recv); - if ( need_value) SUPERLU_FREE(a_recv); -#if ( DEBUGlevel>=1 ) - if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat)); - CHECK_MALLOC(grid->iam, "Exit pdCompRow_loc_to_CompCol_global"); -#endif - return 0; -} /* pdCompRow_loc_to_CompCol_global */ - - -/* - * Permute the distributed dense matrix: B <= perm(X). - * perm[i] = j means the i-th row of X is in the j-th row of B. - */ -int pdPermute_Dense_Matrix -( - int_t fst_row, - int_t m_loc, - int_t row_to_proc[], - int_t perm[], - double X[], int ldx, - double B[], int ldb, - int nrhs, - gridinfo_t *grid -) -{ - int_t i, j, k, l; - int p, procs; - int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdPermute_Dense_Matrix()"); -#endif - - procs = grid->nprow * grid->npcol; - if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) - ABORT("Malloc fails for sendcnts[]."); - sendcnts_nrhs = sendcnts + procs; - recvcnts = sendcnts_nrhs + procs; - recvcnts_nrhs = recvcnts + procs; - sdispls = recvcnts_nrhs + procs; - sdispls_nrhs = sdispls + procs; - rdispls = sdispls_nrhs + procs; - rdispls_nrhs = rdispls + procs; - ptr_to_ibuf = rdispls_nrhs + procs; - ptr_to_dbuf = ptr_to_ibuf + procs; - - for (i = 0; i < procs; ++i) sendcnts[i] = 0; - - /* Count the number of X entries to be sent to each process.*/ - for (i = fst_row; i < fst_row + m_loc; ++i) { - p = row_to_proc[perm[i]]; - ++sendcnts[p]; - } - MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - sdispls_nrhs[0] = rdispls_nrhs[0] = 0; - sendcnts_nrhs[0] = sendcnts[0] * nrhs; - recvcnts_nrhs[0] = recvcnts[0] * nrhs; - for (i = 1; i < procs; ++i) { - sdispls[i] = sdispls[i-1] + sendcnts[i-1]; - sdispls_nrhs[i] = sdispls[i] * nrhs; - rdispls[i] = rdispls[i-1] + recvcnts[i-1]; - rdispls_nrhs[i] = rdispls[i] * nrhs; - sendcnts_nrhs[i] = sendcnts[i] * nrhs; - recvcnts_nrhs[i] = recvcnts[i] * nrhs; - } - k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ - l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ - /*assert(k == m_loc);*/ - /*assert(l == m_loc);*/ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - - for (i = 0; i < procs; ++i) { - ptr_to_ibuf[i] = sdispls[i]; - ptr_to_dbuf[i] = sdispls_nrhs[i]; - } - - /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ - for (i = fst_row; i < fst_row + m_loc; ++i) { - j = perm[i]; - p = row_to_proc[j]; - send_ibuf[ptr_to_ibuf[p]] = j; - j = ptr_to_dbuf[p]; - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - send_dbuf[j++] = X[i-fst_row + k*ldx]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - - /* Transfer the (permuted) row indices and numerical values. */ - MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, - recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - - /* Copy the buffer into b. */ - for (i = 0, l = 0; i < m_loc; ++i) { - j = recv_ibuf[i] - fst_row; /* Relative row number */ - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - B[j + k*ldb] = recv_dbuf[l++]; - } - } - - SUPERLU_FREE(sendcnts); - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdPermute_Dense_Matrix()"); -#endif - return 0; -} /* pdPermute_Dense_Matrix */ - - -/* - * Initialize the data structure for the solution phase. - */ -int dSolveInit(superlu_options_t *options, SuperMatrix *A, - int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) -{ - int_t *row_to_proc, *inv_perm_c, *itemp; - NRformat_loc *Astore; - int_t i, fst_row, m_loc, p; - int procs; - - Astore = (NRformat_loc *) A->Store; - fst_row = Astore->fst_row; - m_loc = Astore->m_loc; - procs = grid->nprow * grid->npcol; - - if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) - ABORT("Malloc fails for row_to_proc[]"); - SOLVEstruct->row_to_proc = row_to_proc; - if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) - ABORT("Malloc fails for inv_perm_c[]."); - for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; - SOLVEstruct->inv_perm_c = inv_perm_c; - - /* ------------------------------------------------------------ - EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. - SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. - - NOTE: For those processes that do not own any row, it must - must be set so that fst_row == A->nrow. - ------------------------------------------------------------*/ - if ( !(itemp = intMalloc_dist(procs+1)) ) - ABORT("Malloc fails for itemp[]"); - MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); - itemp[procs] = A->nrow; - for (p = 0; p < procs; ++p) { - for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; - } -#if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { - printf("fst_row = %d\n", fst_row); - PrintInt10("row_to_proc", A->nrow, row_to_proc); - PrintInt10("inv_perm_c", A->ncol, inv_perm_c); - } -#endif - SUPERLU_FREE(itemp); - -#if 0 - /* Compute the mapping between rows and processes. */ - /* XSL NOTE: What happens if # of mapped processes is smaller - than total Procs? For the processes without any row, let - fst_row be EMPTY (-1). Make sure this case works! */ - MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); - itemp[procs] = n; - for (p = 0; p < procs; ++p) { - j = itemp[p]; - if ( j != EMPTY ) { - k = itemp[p+1]; - if ( k == EMPTY ) k = n; - for (i = j ; i < k; ++i) row_to_proc[i] = p; - } - } -#endif - - get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, - &SOLVEstruct->num_diag_procs, - &SOLVEstruct->diag_procs, - &SOLVEstruct->diag_len); - - if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, - LUstruct->Glu_persist, SOLVEstruct); - - if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *) - SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) ) - ABORT("Malloc fails for gsmv_comm[]"); - SOLVEstruct->A_colind_gsmv = NULL; - - options->SolveInitialized = YES; - return 0; -} /* dSolveInit */ - -/* - * Release the resources used for the solution phase. - */ -void dSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) -{ - int_t *it; - pxgstrs_finalize(SOLVEstruct->gstrs_comm); - if ( options->RefineInitialized ) { - pdgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; - } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; -} /* dSolveFinalize */ - -/* - * Check the inf-norm of the error vector - */ -void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, - double xtrue[], int_t ldxtrue, gridinfo_t *grid) -{ - double err, xnorm, temperr, tempxnorm; - double *x_work, *xtrue_work; - int i, j; - - for (j = 0; j < nrhs; j++) { - x_work = &x[j*ldx]; - xtrue_work = &xtrue[j*ldxtrue]; - err = xnorm = 0.0; - for (i = 0; i < n; i++) { - err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); - xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); - } - - /* get the golbal max err & xnrom */ - temperr = err; - tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - - err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); - } -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,5174 +0,0 @@ - -/* - * -- Parallel symbolic factorization routine (version 2.2) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 - * INRIA France - January 2004 - * Laura Grigori - * - * November 1, 2007 - * Feburary 20, 2008 - */ - - -/* - * The function symbfact_dist implements the parallel symbolic factorization - * algorithm described in the paper: - * - * Parallel Symbolic Factorization for Sparse LU with Static Pivoting, - * Laura Grigori, James W. Demmel and Xiaoye S. Li, - * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3. - * - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include -#include -#include "superlu_ddefs.h" -#include "psymbfact.h" - -/* - * Internal protypes - */ - -static int_t * -intMalloc_symbfact(int_t ); - -static int_t * -intCalloc_symbfact(int_t ); - -static int_t -initParmsAndStats -(psymbfact_stat_t *PS); - -static void -estimate_memUsage -(int_t, int, mem_usage_t *, float *, float *, - Pslu_freeable_t *, Llu_symbfact_t *, - vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *); - -static void -symbfact_free -(int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *); - -static int_t -denseSep_symbfact -(int , int_t, int, int, int, int_t *, int_t *, int, - int, int, int_t, int_t, int_t *, int_t *, int_t *, - int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *, - Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t * ); - -static int_t -dnsUpSeps_symbfact -(int_t, int, int, int, int, int_t *, int_t *, int_t, - Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *); - -static void -intraLvl_symbfact -(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, - int, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *); - -static void -initLvl_symbfact -(int_t, int, int_t, int_t, Pslu_freeable_t *, - Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, - int_t *, int_t, int_t); - -static void -createComm (int, int, MPI_Comm *, MPI_Comm *); - -static void -freeComm (int, int, MPI_Comm *, MPI_Comm *); - -static void -domain_symbfact -(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, - int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *); - -static float -allocPrune_domain -(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -static float -allocPrune_lvl -(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -static int -symbfact_alloc -(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, - vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *); - -static float -symbfact_mapVtcs -(int, int, int, SuperMatrix *, int_t *, int_t *, - Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *); - -static void -symbfact_distributeMatrix -(int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, - Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *); - -static int_t -interLvl_symbfact -(SuperMatrix *, int, int, int, int, int, int, int, - int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, - Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *, - psymbfact_stat_t *, MPI_Comm, MPI_Comm *); - -static float -cntsVtcs -(int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *); - -/************************************************************************/ -float symbfact_dist -/************************************************************************/ -( - int nprocs_num, /* Input - no of processors */ - int nprocs_symb, /* Input - no of processors for the symbolic - factorization */ - SuperMatrix *A, /* Input - distributed input matrix */ - int_t *perm_c, /* Input - column permutation */ - int_t *perm_r, /* Input - row permutation */ - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, - global to local indexing information */ - MPI_Comm *num_comm, /* Input - communicator for numerical factorization */ - MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ - mem_usage_t *symb_mem_usage - ) -{ -/* - * Purpose - * ======= - * symbfact_dist() performs symbolic factorization of matrix A suitable - * for performing the supernodal Gaussian elimination with no pivoting (GEPP). - * This routine computes the structure of one column of L and one row of U - * at a time. It uses: - * o distributed input matrix - * o supernodes - * o symmetric structure pruning - * - * - * Arguments - * ========= - * - * nprocs_num (input) int - * Number of processors SuperLU_DIST is executed on, and the input - * matrix is distributed on. - * - * nprocs_symb (input) int - * Number of processors on which the symbolic factorization is - * performed. It is equal to the number of independent domains - * idenfied in the graph partitioning algorithm executed - * previously and has to be a power of 2. It corresponds to - * number of leaves in the separator tree. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The - * number of the linear equations is A->nrow. Matrix A is - * distributed in NRformat_loc format. - * Matrix A is not yet permuted by perm_c. - * - * perm_c (input) int_t* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int_t* - * Row permutation vector of size A->nrow, which defines the - * permutation matrix Pr; perm_r[i] = j means column i of A is - * in position j in Pr*A. - * - * sizes (input) int_t* - * Contains the number of vertices in each separator. - * - * fstVtxSep (input) int_t* - * Contains first vertex for each separator. - * - * Pslu_freeable (output) Pslu_freeable_t* - * Returns the local L and U structure, and global to local - * information on the indexing of the vertices. Contains all - * the information necessary for performing the data - * distribution towards the numeric factorization. - * - * num_comm (input) MPI_Comm* - * Communicator for numerical factorization - * - * symb_comm (input) MPI_Comm* - * Communicator for symbolic factorization - * - * symb_mem_usage (input) mem_usage_t * - * Statistics on memory usage. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the symbolic factorization. - * > 0, number of bytes allocated when out of memory. - * - * Sketch of the algorithm - * ======================= - * - * Distrbute the vertices on the processors using a subtree to - * subcube algorithm. - * - * Redistribute the structure of the input matrix A according to the - * subtree to subcube computed previously for the symbolic - * factorization routine. This implies in particular a distribution - * from nprocs_num processors to nprocs_symb processors. - * - * Perform symbolic factorization guided by the separator tree provided by - * a graph partitioning algorithm. The symbolic factorization uses a - * combined left-looking, right-looking approach. - * - */ - NRformat_loc *Astore; - int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep; - int iinfo; /* return code */ - int_t m, n; - int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr; - int_t ind_blk, nsuper, vtx, min_mn, nnzL, nnzU, szsn; - float stat_loc[23], stat_glob[23], mem_glob[15]; - - Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */ - vtcsInfo_symbfact_t VInfo; /* local information on number of blocks, - number of vertices in a block etc */ - matrix_symbfact_t AS; /* temporary storage for the input matrix after redistribution */ - comm_symbfact_t CS; /* information on communication */ - /* relaxation parameters (for future release) and - statistics collected during the symbolic factorization */ - psymbfact_stat_t PS; - /* temp array of size n, used as a marker by the subroutines */ - int_t *tempArray; - int_t i, j, k; - int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc; - int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd; - float totalMemLU, overestimMem; - MPI_Comm *commLvls; - - /* maximum block size */ - int_t maxSzBlk; - float flinfo; -#if ( PRNTlevel >= 1) - float stat_msgs_l[10], stat_msgs_g[10]; -#endif -#if ( PROFlevel>=1 ) - double t, t_symbFact[3], t_symbFact_loc[3]; - double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9]; -#endif - - /* Initialization */ - MPI_Comm_rank ((*num_comm), &iam); - commLvls = NULL; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter psymbfact()"); -#endif - initParmsAndStats (&PS); - if (nprocs_symb != 1) { - if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) { - fprintf (stderr, "Malloc fails for commLvls[]."); - return (PS.allocMem); - } - PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm); - } - - nlvls = (int) LOG2( nprocs_num ) + 1; -#if ( PROFlevel>=1 ) - time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) - * sizeof(double)); - time_lvls = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double)); - if (!time_lvls || !time_lvlsT) { - fprintf (stderr, "Malloc fails for time_lvls[]."); - return (PS.allocMem); - } - PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double); -#endif - - VInfo.xlsub_nextLvl = 0; - VInfo.xusub_nextLvl = 0; - VInfo.maxSzBlk = sp_ienv_dist(3); - maxSzBlk = VInfo.maxSzBlk; - - mark = EMPTY; - nsuper_loc = 0; - nextl = 0; nextu = 0; - neltsZr = 0; neltsTotal = 0; - - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN( m, n ); - - if (!(tempArray = intMalloc_symbfact(n))) { - fprintf (stderr, "Malloc fails for tempArray[].\n"); - return (PS.allocMem); - } - PS.allocMem += n * sizeof(int_t); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Distribute vertices on processors */ - if ((flinfo = - symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, - Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) - return (flinfo); - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - - /* Redistribute matrix A on processors following the distribution found - in symbfact_mapVtcs. Store the redistributed A temporarily into AS */ - symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb, A, - perm_c, perm_r, &AS, - Pslu_freeable, &VInfo, tempArray, num_comm); - - /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB - PROCESSORS */ - if ( iam < nprocs_symb ) { - -#if ( PROFlevel>=1 ) - t_symbFact_loc[0] = SuperLU_timer_() - t; - t = SuperLU_timer_(); - t_symbFact_loc[1] = t; -#endif - - /* Allocate storage common to the symbolic factor routines */ - if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, - &Llu_symbfact, &VInfo, &CS, &PS)) - return (PS.allocMem); - - /* Copy the redistributed input matrix AS at the end of the memory buffer - allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in - (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub). Free the - memory used to store the input matrix */ - nnz_ainf_loc = VInfo.nnz_ainf_loc; - nnz_asup_loc = VInfo.nnz_asup_loc; - j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc; - k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc; - for (i = 0; i <= VInfo.nvtcs_loc; i++) { - Llu_symbfact.xusub[i] = AS.x_asup[i] + j; - Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k; - } - - for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) - Llu_symbfact.usub[j] = AS.ind_asup[i]; - for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++) - Llu_symbfact.lsub[k] = AS.ind_ainf[i]; - SUPERLU_FREE( AS.x_ainf ); - SUPERLU_FREE( AS.x_asup ); - SUPERLU_FREE( AS.ind_ainf ); - SUPERLU_FREE( AS.ind_asup ); - - if (nprocs_symb != 1) { - createComm (iam, nprocs_symb, commLvls, symb_comm); - -#if ( PROFlevel>=1 ) - t_symbFact_loc[2] = SuperLU_timer_(); -#endif - if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, - &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) - return (flinfo); - -#if ( PROFlevel>=1 ) - t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2]; -#endif - } - - /* set to EMPTY marker[] array */ - for (i = 0; i < n; i++) - tempArray[i] = EMPTY; - - szSep = nprocs_symb; - iSep = 0; - lvl = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - fstP = 0; - /* for each node in the level */ - for (jSep = iSep; jSep < iSep + szSep; jSep++) { - fstVtx = fstVtxSep[jSep]; - lstVtx = fstVtx + sizes[jSep]; - /* if this is the first level */ - if (szSep == nprocs_symb) { - /* compute symbolic factorization for my domain */ - if (fstP == iam) { - /* allocate storage for the pruned structures */ -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); -#endif - if ((flinfo = allocPrune_domain (fstVtx, lstVtx, - &Llu_symbfact, &VInfo, &PS)) > 0) - return (flinfo); - if (fstVtx < lstVtx) - VInfo.fstVtx_nextLvl = VInfo.begEndBlks_loc[2]; - - domain_symbfact - (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstVtx, lstVtx, - Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, tempArray, - &mark, &nextl, &nextu, &neltsZr, &neltsTotal, &nsuper_loc); - - PS.estimLSz = nextl; - PS.estimUSz = nextu; - if (nprocs_symb != 1) - if((flinfo = allocPrune_lvl (&Llu_symbfact, &VInfo, &PS)) > 0) - return (flinfo); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[lvl] = 0.; time_lvls[lvl+1] = 0.; - time_lvls[lvl + 2] = t2 - t1; -#endif - } - } - else { - lstP = fstP + npNode; - if (fstP <= iam && iam < lstP) { -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); -#endif - if (VInfo.filledSep != FILLED_SEPS) - initLvl_symbfact(n, iam, fstVtx, lstVtx, - Pslu_freeable, &Llu_symbfact, &VInfo, &PS, commLvls[jSep], - tempArray, nextl, nextu); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[3*lvl] = t2 - t1; -#endif - interLvl_symbfact (A, iam, lvl, szSep, fstP, lstP, - iSep, jSep, sizes, fstVtxSep, - &nextl, &nextu, &nsuper_loc, &mark, tempArray, - &Llu_symbfact, Pslu_freeable, &CS, &VInfo, &PS, - commLvls[jSep], symb_comm); -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); - time_lvls[3*lvl+1] = t1 - t2; -#endif - if (VInfo.filledSep != FILLED_SEPS) - intraLvl_symbfact - (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstP, lstP, - fstVtx, lstVtx, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, - tempArray, &mark, &nextl, &nextu, &neltsZr, &neltsTotal, - &nsuper_loc, commLvls[jSep], symb_comm); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[3*lvl+2] = t2 - t1; -#endif - } - } - fstP += npNode; - } - iSep += szSep; - szSep = szSep / 2; - lvl ++; - } - - SUPERLU_FREE( tempArray ); - - /* Set up global information and collect statistics */ - if (PS.maxSzLPr < Llu_symbfact.indLsubPr) - PS.maxSzLPr = Llu_symbfact.indLsubPr; - if (PS.maxSzUPr < Llu_symbfact.indUsubPr) - PS.maxSzUPr = Llu_symbfact.indUsubPr; - - Llu_symbfact.xlsub[VInfo.nvtcs_loc] = nextl; - Llu_symbfact.xusub[VInfo.nvtcs_loc] = nextu; - fill_rcmd = SUPERLU_MAX( nextl / nnz_ainf_loc, nextu / nnz_asup_loc) + 1; - Pslu_freeable->xsup_beg_loc = intMalloc_dist (nsuper_loc+1); - Pslu_freeable->xsup_end_loc = intMalloc_dist (nsuper_loc+1); - if (!Pslu_freeable->xsup_beg_loc || !Pslu_freeable->xsup_end_loc) { - fprintf (stderr, "Malloc fails for xsup_beg_loc, xsup_end_loc."); - return (PS.allocMem); - } - PS.allocMem += 2 * (nsuper_loc+1) * sizeof(int_t); - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nnzL = 0; nnzU = 0; - - i = 0; - nsuper = 0; - ind_blk = 0; - for (ind_blk = 0; ind_blk < VInfo.nblks_loc; ind_blk ++) { - fstVtx = VInfo.begEndBlks_loc[2 * ind_blk]; - lstVtx = VInfo.begEndBlks_loc[2 * ind_blk + 1]; - fstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[fstVtx] ); - nsuper = Pslu_freeable->supno_loc[fstVtx_lid]; - Pslu_freeable->xsup_beg_loc[nsuper] = fstVtx; - szsn = 1; - if (INT_MAX - nnzL <= Llu_symbfact.xlsub[fstVtx_lid + 1] - - Llu_symbfact.xlsub[fstVtx_lid]) - printf ("PE[%d] ERR nnzL %d\n", iam, nnzL); - if (INT_MAX - nnzU <= Llu_symbfact.xusub[fstVtx_lid + 1] - - Llu_symbfact.xusub[fstVtx_lid]) - printf ("PE[%d] ERR nnzU %d\n", iam, nnzU); - - j = Llu_symbfact.xlsub[fstVtx_lid + 1] - Llu_symbfact.xlsub[fstVtx_lid]; - k = Llu_symbfact.xusub[fstVtx_lid + 1] - Llu_symbfact.xusub[fstVtx_lid]; - nnzL += j; - nnzU += k; - - for (vtx = fstVtx + 1, vtx_lid = fstVtx_lid + 1; - vtx < lstVtx; vtx++, vtx_lid ++) { - if (Pslu_freeable->supno_loc[vtx_lid] != nsuper) { - nsuper = Pslu_freeable->supno_loc[vtx_lid]; - Pslu_freeable->xsup_end_loc[nsuper-1] = vtx; - Pslu_freeable->xsup_beg_loc[nsuper] = vtx; - szsn = 1; - j = Llu_symbfact.xlsub[vtx_lid + 1] - Llu_symbfact.xlsub[vtx_lid]; - k = Llu_symbfact.xusub[vtx_lid + 1] - Llu_symbfact.xusub[vtx_lid]; - } - else { - szsn ++; - } - nnzL += j - szsn + 1; - nnzU += k - szsn + 1; - } - Pslu_freeable->xsup_end_loc[nsuper] = lstVtx; - } - Pslu_freeable->supno_loc[VInfo.nvtcs_loc] = nsuper_loc; - Pslu_freeable->nvtcs_loc = VInfo.nvtcs_loc; - - /* set up xsup data */ - Pslu_freeable->lsub = Llu_symbfact.lsub; - Pslu_freeable->xlsub = Llu_symbfact.xlsub; - Pslu_freeable->usub = Llu_symbfact.usub; - Pslu_freeable->xusub = Llu_symbfact.xusub; - Pslu_freeable->szLsub = Llu_symbfact.szLsub; - Pslu_freeable->szUsub = Llu_symbfact.szUsub; - -#if ( PROFlevel>=1 ) - t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1]; -#endif - -#if ( PRNTlevel>=1 ) - estimate_memUsage (n, iam, symb_mem_usage, - &totalMemLU, &overestimMem, - Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS); - stat_loc[0] = (float) nnzL; - stat_loc[1] = (float) nnzU; - stat_loc[2] = (float) nsuper_loc; - stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc]; - stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc]; - stat_loc[5] = totalMemLU; - stat_loc[6] = overestimMem; - stat_loc[7] = totalMemLU - overestimMem; - stat_loc[8] = (float) PS.maxSzBuf; - stat_loc[9] = (float) PS.nDnsUpSeps; - stat_loc[10] = (float) PS.nDnsCurSep; - stat_loc[11] = (float) (Llu_symbfact.no_expand + Llu_symbfact.no_expcp + - Llu_symbfact.no_expand_pr); - stat_loc[12] = (float) Llu_symbfact.no_expand; - stat_loc[13] = (float) Llu_symbfact.no_expcp; - stat_loc[14] = (float) Llu_symbfact.no_expand_pr; - stat_loc[15] = (float) fill_rcmd; - stat_loc[16] = PS.nops; - stat_loc[17] = PS.fill_pelt[1]; - stat_loc[18] = PS.fill_pelt[4]; - stat_loc[19] = PS.fill_pelt[0]; - stat_loc[20] = PS.fill_pelt[2]; - stat_loc[21] = PS.fill_pelt[3]; - stat_loc[22] = PS.fill_pelt[5]; - - MPI_Reduce (stat_loc, stat_glob, 23, MPI_FLOAT, - MPI_SUM, 0, (*symb_comm)); - MPI_Reduce (&(stat_loc[5]), mem_glob, 14, MPI_FLOAT, - MPI_MAX, 0, (*symb_comm)); - fill_rcmd = (int_t) mem_glob[10]; - PS.fill_pelt[0] = stat_glob[19]; - PS.fill_pelt[1] = mem_glob[12]; - PS.fill_pelt[2] = stat_glob[20]; - PS.fill_pelt[3] = stat_glob[21]; - PS.fill_pelt[4] = mem_glob[13]; - PS.fill_pelt[5] = stat_glob[22]; - if (PS.fill_pelt[2] == 0.) PS.fill_pelt[2] = 1.; - if (PS.fill_pelt[5] == 0.) PS.fill_pelt[5] = 1.; - -#if ( PROFlevel>=1 ) - MPI_Reduce (t_symbFact_loc, t_symbFact, 3, MPI_DOUBLE, - MPI_MAX, 0, (*symb_comm)); - MPI_Gather (time_lvls, 3 * nlvls, MPI_DOUBLE, - time_lvlsT, 3 * nlvls , MPI_DOUBLE, - 0, (*symb_comm)); -#endif - - stat_msgs_l[0] = (float) PS.maxsz_msgSnd; - stat_msgs_l[1] = (float) PS.maxsz_msgSnd; - if (PS.maxsz_msgSnd < PS.maxsz_msgCol) - stat_msgs_l[1] = PS.maxsz_msgCol; - stat_msgs_l[2] = PS.no_shmSnd + PS.no_msgsSnd + - PS.no_shmRcvd + PS.no_msgsRcvd; - stat_msgs_l[3] = stat_msgs_l[2] + PS.no_msgsCol; - stat_msgs_l[4] = stat_msgs_l[2]; - stat_msgs_l[5] = stat_msgs_l[3]; - stat_msgs_l[6] = PS.no_msgsSnd; - stat_msgs_l[7] = PS.no_msgsSnd + PS.no_msgsCol; - stat_msgs_l[8] = PS.sz_msgsSnd; - stat_msgs_l[9] = PS.sz_msgsSnd + PS.sz_msgsCol; - MPI_Reduce (stat_msgs_l, stat_msgs_g, 4, MPI_FLOAT, - MPI_MAX, 0, (*symb_comm)); - MPI_Reduce (&(stat_msgs_l[4]), &(stat_msgs_g[4]), 6, MPI_FLOAT, - MPI_SUM, 0, (*symb_comm)); - if (stat_msgs_g[6] == 0) stat_msgs_g[6] = 1; - if (stat_msgs_g[7] == 0) stat_msgs_g[7] = 1; - - if (!iam) { - nnzL = (int_t) stat_glob[0]; nnzU = (int_t) stat_glob[1]; - nsuper = (int_t) stat_glob[2]; - szLGr = (int_t) stat_glob[3]; szUGr = (int_t) stat_glob[4]; - printf("\tMax szBlk %ld\n", VInfo.maxSzBlk); -#if ( PRNTlevel>=2 ) - printf("\t relax_gen %.2f, relax_curSep %.2f, relax_seps %.2f\n", - PS.relax_gen, PS.relax_curSep, PS.relax_seps); -#endif - printf("\tParameters: fill mem %ld fill pelt %ld\n", - sp_ienv_dist(6), PS.fill_par); - printf("\tNonzeros in L %ld\n", nnzL); - printf("\tNonzeros in U %ld\n", nnzU); - printf("\tnonzeros in L+U-I %ld\n", - nnzL + nnzU); - printf("\tNo of supers %ld\n", nsuper); - printf("\tSize of G(L) %ld\n", szLGr); - printf("\tSize of G(U) %ld\n", szUGr); - printf("\tSize of G(L+U) %ld\n", szLGr+szUGr); - - printf("\tParSYMBfact (MB) :\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[0]*1e-6, - stat_glob[5]/nprocs_symb*1e-6); -#if ( PRNTlevel>=2 ) - printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[1]*1e-6, - stat_glob[6]/nprocs_symb*1e-6); - printf("\tsnd/rcv buffers (MB):\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[3]*1e-6, - stat_glob[8]/nprocs_symb*1e-6); - printf("\tSYMBfact 2*n+4*nvtcs_loc+2*maxNvtcsNds_loc:\tL\\U %.2f\n", - (float) (2 * n * sizeof(int_t)) *1e-6); - printf("\tint_t %d, int %d, long int %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(int), sizeof(long int), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tDNS ALLSEPS:\t MAX %d\tAVG %.2f\n", - (int_t) mem_glob[4], stat_glob[9]/nprocs_symb); - printf("\tDNS CURSEP:\t MAX %d\tAVG %.2f\n\n", - (int_t) mem_glob[5], stat_glob[10]/nprocs_symb); - - printf("\t MAX FILL Mem(L+U) / Mem(A) per processor %ld\n", fill_rcmd); - printf("\t Per elt MAX %ld AVG %ld\n", - (int_t) PS.fill_pelt[4], (int_t)(PS.fill_pelt[3]/PS.fill_pelt[5])); - printf("\t Per elt RL MAX %ld AVG %ld\n", - (int_t) PS.fill_pelt[1], (int_t)(PS.fill_pelt[0]/PS.fill_pelt[2])); - printf("\tM Nops:\t MAX %.2f\tAVG %.2f\n", - mem_glob[11]*1e-6, (stat_glob[16]/nprocs_symb)*1e-6); - - - printf("\tEXPANSIONS: MAX/AVG\n"); - printf("\tTOTAL: %d / %.2f\n", - (int_t) mem_glob[6], stat_glob[11]/nprocs_symb); - printf("\tREALLOC: %.f / %.2f RL_CP %.f / %.2f PR_CP %.f / %.2f\n", - mem_glob[7], stat_glob[12]/nprocs_symb, - mem_glob[8], stat_glob[13]/nprocs_symb, - mem_glob[9], stat_glob[14]/nprocs_symb); - - printf ("\n\tDATA MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n", - stat_msgs_g[2]*1e-3, stat_msgs_g[4]/nprocs_symb*1e-3, - stat_msgs_g[0]*1e-6, stat_msgs_g[8] / stat_msgs_g[6]*1e-6); - printf ("\tTOTAL MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n", - stat_msgs_g[3]*1e-3, stat_msgs_g[5]/nprocs_symb*1e-3, - stat_msgs_g[1]*1e-6, stat_msgs_g[9]/stat_msgs_g[7]*1e-6); -#endif - -#if ( PROFlevel>=1 ) - printf("Distribute matrix time = %8.3f\n", t_symbFact[0]); - printf("Count vertices time = %8.3f\n", t_symbFact[2]); - printf("Symbfact DIST time = %8.3f\n", t_symbFact[1]); - - printf("\nLvl\t Time\t Init\t Inter\t Intra\n"); - time_lvlsg[0] = 0.; - for (i = 0; i < nlvls; i++) { - for (j = 1; j < 9; j++) - time_lvlsg[j] = 0.; - for (p = 0; p < nprocs_symb; p++) { - k = p * 3 * nlvls; - t = time_lvlsT[i*3+k] + time_lvlsT[i*3+k+1] + time_lvlsT[i*3+k+2]; - if (t > time_lvlsg[1]) { - time_lvlsg[1] = t; j = p; - } - time_lvlsg[2] += t; - if (time_lvlsT[i*3+k] > time_lvlsg[3]) - time_lvlsg[3] = time_lvlsT[i*3+k]; - time_lvlsg[4] += time_lvlsT[i*3+k]; - if (time_lvlsT[i*3+k+1] > time_lvlsg[5]) - time_lvlsg[5] = time_lvlsT[i*3+k+1]; - time_lvlsg[6] += time_lvlsT[i*3+k+1]; - if (time_lvlsT[i*3+k+2] > time_lvlsg[7]) - time_lvlsg[7] = time_lvlsT[i*3+k+2]; - time_lvlsg[8] += time_lvlsT[i*3+k+2]; - } - time_lvlsg[0] += time_lvlsg[1]; - printf ("%d \t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\n", i, - time_lvlsg[1], time_lvlsg[2] / nprocs_symb, - time_lvlsg[3], time_lvlsg[4] / nprocs_symb, - time_lvlsg[5], time_lvlsg[6] /nprocs_symb, - time_lvlsg[7], time_lvlsg[8] / nprocs_symb); - } - printf("\t %8.3f \n", time_lvlsg[0]); -#endif - } -#endif -#if ( PROFlevel>=1 ) - SUPERLU_FREE (time_lvls); - SUPERLU_FREE (time_lvlsT); -#endif - symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS); - } /* if (iam < nprocs_symb) */ - else { - /* update Pslu_freeable before returning */ - Pslu_freeable->nvtcs_loc = 0; - Pslu_freeable->xlsub = NULL; Pslu_freeable->lsub = NULL; - Pslu_freeable->xusub = NULL; Pslu_freeable->usub = NULL; - Pslu_freeable->supno_loc = NULL; - Pslu_freeable->xsup_beg_loc = NULL; - Pslu_freeable->xsup_end_loc = NULL; - - SUPERLU_FREE( tempArray ); - PS.allocMem -= n * sizeof(int_t); - } - - if (iam < nprocs_symb && nprocs_symb != 1) - freeComm (iam, nprocs_symb, commLvls, symb_comm); - if (commLvls != NULL) - SUPERLU_FREE( commLvls ); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit psymbfact()"); -#endif - - return (- PS.allocMem); -} /* SYMBFACT_DIST */ - - -static int_t -initParmsAndStats -( - psymbfact_stat_t *PS /* Output -statistics*/ -) -/* - * Purpose - * ======= - * Initialize relaxation parameters and statistics variables - */ -{ - int i; - - PS->nDnsCurSep = 0; - PS->nDnsUpSeps = 0; - - PS->relax_gen = 1.0; - PS->relax_curSep = 1.0; - PS->relax_seps = 1.0; - PS->fill_par = sp_ienv_dist(6); - PS->nops = 0.; - PS->no_shmSnd = 0.; - PS->no_msgsSnd = 0.; - PS->maxsz_msgSnd = 0; - PS->sz_msgsSnd = 0.; - PS->no_shmRcvd = 0.; - PS->no_msgsRcvd = 0.; - PS->maxsz_msgRcvd = 0; - PS->sz_msgsRcvd = 0.; - PS->no_msgsCol = 0.; - PS->maxsz_msgCol = 0; - PS->sz_msgsCol = 0.; - - for (i = 0; i < 6; i++) - PS->fill_pelt[i] = 0.; - - PS->estimUSz = 0; - PS->estimLSz = 0; - PS->maxSzLPr = 0; - PS->maxSzUPr = 0; - PS->maxSzBuf = 0; - PS->szDnsSep = 0; - PS->allocMem = 0; -} - -static float -cntsVtcs -( - int_t n, /* Input - order of the input matrix */ - int iam, /* Input - my processor number */ - int nprocs_symb, /* Input - no of processors for symbolic factorization */ - Pslu_freeable_t *Pslu_freeable, /* Input -globToLoc and maxNvtcsPProc */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output -local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - int_t *tempArray, /* Input - temporary storage */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t *sizes, /* Input - sizes of each node in the tree */ - psymbfact_stat_t *PS, /* Input/Output -statistics */ - MPI_Comm *commLvls - ) -/* - * Purpose - * ======= - * - * Computes an estimation of the number of elements in columns of L - * and rows of U. Stores this information in cntelt_vtcs, and it will - * be used in the right-looking symbolic factorization. - */ -{ - int fstP, lstP, szSep, npNode, i, j; - int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk; - int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk; - int_t nelts, nelts_new_blk; - int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc; - int_t *minElt_vtx, *cntelt_vtcs; - - /* Initialization */ - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - cntelt_vtcs = Llu_symbfact->cntelt_vtcs; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = VInfo->nvtcs_loc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n) - minElt_vtx = lsub; - else { - /* allocate memory for minElt_vtx */ - if (!(minElt_vtx = intMalloc_dist(n))) { - fprintf(stderr, "Malloc fails for minElt_vtx[]."); - return (PS->allocMem); - } - PS->allocMem += n * sizeof (int_t); - } - - for (ii = 0; ii < n; ii++) - tempArray[ii] = n; - for (ii = 0; ii < nvtcs_loc; ii++) - cntelt_vtcs[ii] = 0; - - szSep = nprocs_symb; - i = 0; - cur_blk = 0; - vtx_lid = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - fstP = 0; - /* for each node in the level */ - for (j = i; j < i + szSep; j++) { - fstVtx = fstVtxSep[j]; - lstVtx = fstVtx + sizes[j]; - lstP = fstP + npNode; - - if (fstP <= iam && iam < lstP) { - ind_blk = cur_blk; - ii = vtx_lid; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) { - for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) { - vtx_elt = lsub[jj]; - if (tempArray[vtx_elt] == n) { - tempArray[vtx_elt] = vtx; - } - } - for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) { - vtx_elt = usub[jj]; - if (tempArray[vtx_elt] == n) { - tempArray[vtx_elt] = vtx; - } - } - } - } - if (szSep == nprocs_symb) - vtx_lid = ii; - else { - MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]), - (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int) LOG2( npNode )); - PS->sz_msgsCol += (float) (n - fstVtx); - if (PS->maxsz_msgCol < n - fstVtx) - PS->maxsz_msgCol = n - fstVtx; -#endif - - nelts = 0; - for (ii = fstVtx; ii < lstVtx; ii++) - tempArray[ii] = 0; - for (ii = fstVtx; ii < n; ii++) { - if (minElt_vtx[ii] != n) { - if (minElt_vtx[ii] < fstVtx) - nelts ++; - else - tempArray[minElt_vtx[ii]] ++; - if (ii > lstVtx) - tempArray[ii] = minElt_vtx[ii]; - } - } - - ind_blk = cur_blk; - lv = fstVtx; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - - for (ii = lv; ii < fstVtx_blk; ii++) - nelts += tempArray[ii]; - lv = lstVtx_blk; - - nelts_new_blk = 0; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - nelts_new_blk += tempArray[vtx]; - cntelt_vtcs[vtx_lid] = nelts; - } - nelts += nelts_new_blk; - } - } /* if (szSep != nprocs_symb) */ - cur_blk = ind_blk; - } - fstP += npNode; - } - i += szSep; - szSep = szSep / 2; - } - /* free memory */ - if (minElt_vtx != lsub) { - SUPERLU_FREE (minElt_vtx); - PS->allocMem -= n * sizeof(int_t); - } - return (SUCCES_RET); -} - -static float -symbfact_mapVtcs -( - int iam, /* Input -process number */ - int nprocs_num, /* Input -number of processors */ - int nprocs_symb, /* Input -number of procs for symbolic factorization */ - SuperMatrix *A, /* Input -input distributed matrix A */ - int_t *fstVtxSep, /* Input -first vertex in each separator */ - int_t *sizes, /* Input -size of each separator in the separator tree */ - Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc - computed */ - vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */ - int_t *tempArray, /* Input -temp array of size n = order of the matrix */ - int_t maxSzBlk, /* Input -maximum number of vertices in a block */ - psymbfact_stat_t *PS /* Input/Output -statistics */ - ) -{ -/* - * Purpose - * ======= - * - * symbfact_mapVtcs maps the vertices of the graph of the input - * matrix A on nprocs_symb processors, using the separator tree - * returned by a graph partitioning algorithm from the previous step - * of the symbolic factorization. The number of processors - * nprocs_symb must be a power of 2. - * - * Description of the algorithm - * ============================ - * - * A subtree to subcube algorithm is used first to map the processors - * on the nodes of the separator tree. - * - * For each node of the separator tree, its corresponding vertices - * are distributed on the processors affected to this node, using a - * block cyclic distribution. - * - * After the distribution, fields of the VInfo structure are - * computed. The array globToLoc and maxNvtcsPProc of Pslu_freeable - * are also computed. - * - */ - int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d; - int_t k, n, kk; - int_t fstVtx, lstVtx; - int_t fstVtxBlk, ind_blk; - int_t noVtcsProc, noBlk; - int_t nvtcs_loc; /* number of vertices owned by process iam */ - int_t nblks_loc; /* no of blocks owned by process iam */ - int_t *globToLoc; /* global indexing to local indexing */ - int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx; - int_t *begEndBlks_loc; /* begin and end vertex of each local block */ - int_t *vtcs_pe; /* contains the number of vertices on each processor */ - int *avail_pes; /* contains the processors to be used at each level */ - - n = A->ncol; - /* allocate memory */ - if (!(globToLoc = intMalloc_dist(n + 1))) { - fprintf (stderr, "Malloc fails for globToLoc[]."); - return (PS->allocMem); - } - PS->allocMem += (n+1) * sizeof(int_t); - if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) { - fprintf (stderr, "Malloc fails for avail_pes[]."); - return (PS->allocMem); - } - PS->allocMem += nprocs_symb*sizeof(int); - if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) { - fprintf (stderr, "Malloc fails for vtcs_pe[]."); - return (PS->allocMem); - } - PS->allocMem += nprocs_symb*sizeof(int_t); - - /* Initialization */ - globToLoc[n] = n; - for (p = 0; p < nprocs_symb; p++) { - vtcs_pe[p] = 0; - avail_pes[p] = EMPTY; - } - nvtcs_loc = 0; - nblks_loc = 0; - maxNvtcsNds_loc = 0; - maxNeltsVtx = 0; - - /* distribute data among processors */ - szSep = nprocs_symb; - iSep = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - firstP = 0; - nvtcsNds_loc = 0; - - for (jSep = iSep; jSep < iSep + szSep; jSep++) { - /* for each node in the level */ - fstVtx = fstVtxSep[jSep]; - lstVtx = fstVtx + sizes[jSep]; - if (firstP <= iam && iam < firstP + npNode) - maxNeltsVtx += lstVtx - fstVtx; - - if (szSep == nprocs_symb) { - /* leaves of the separator tree */ - for (k = fstVtx; k < lstVtx; k++) { - globToLoc[k] = (int_t) firstP; - vtcs_pe[firstP] ++; - } - if (firstP == iam) { - nvtcs_loc += lstVtx - fstVtx; - nblks_loc ++; - } - } - else { - /* superior levels of the separator tree */ - k = fstVtx; - noVtcsProc = maxSzBlk; - fstVtxBlk = fstVtx; - if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode; - /* first allocate processors from previous levels */ - for (ind_ap_s = (jSep-iSep) * npNode; ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) { - p = avail_pes[ind_ap_s]; - if (p != EMPTY && k < lstVtx) { - /* for each column in the separator */ - avail_pes[ind_ap_s] = EMPTY; - kk = 0; - while (kk < noVtcsProc && k < lstVtx) { - globToLoc[k] = p; - vtcs_pe[p] ++; - k ++; - kk ++; - } - if (p == iam) { - nvtcs_loc += kk; - nblks_loc ++; - nvtcsNds_loc += kk; - } - } - else { - if (p != EMPTY && k == lstVtx) { - avail_pes[ind_ap_s] = EMPTY; - avail_pes[ind_ap_d] = p; ind_ap_d ++; - } - } - } - noBlk = 0; - p = firstP + npNode; - while (k < lstVtx) { - /* for each column in the separator */ - kk = 0; - p = (int) (noBlk % (int_t) npNode) + firstP; - while (kk < noVtcsProc && k < lstVtx) { - globToLoc[k] = p; - vtcs_pe[p] ++; - k ++; - kk ++; - } - if (p == iam) { - nvtcs_loc += kk; - nblks_loc ++; - nvtcsNds_loc += kk; - } - noBlk ++; - } /* while (k < lstVtx) */ - /* Add the unused processors to the avail_pes list of pes */ - for (p = p + 1; p < firstP + npNode; p ++) { - avail_pes[ind_ap_d] = p; ind_ap_d ++; - } - } - firstP += npNode; - } - if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb) - maxNvtcsNds_loc = nvtcsNds_loc; - iSep += szSep; - szSep = szSep / 2; - } - -#if ( PRNTlevel>=2 ) - if (!iam) - PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe); -#endif - /* determine maximum number of vertices among processors */ - maxNvtcsPProc = vtcs_pe[0]; - vtcs_pe[0] = 0; - for (p = 1; p < nprocs_symb; p++) { - if (maxNvtcsPProc < vtcs_pe[p]) - maxNvtcsPProc = vtcs_pe[p]; - vtcs_pe[p] = 0; - } -#if ( PRNTlevel>=2 ) - if (!iam) - printf (" MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n", - maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb)/(float)n); -#endif - - if (iam < nprocs_symb) - if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1))) - ABORT("Malloc fails for begEndBlks_loc[]."); - - ind_blk = 0; - k = 0; - while (k < n) { - p = globToLoc[k]; - if (p == iam) - begEndBlks_loc[ind_blk] = k; - while (globToLoc[k] == p && k < n) { - globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p]; - vtcs_pe[p] ++; - k ++; - } - if (p == iam) { - begEndBlks_loc[ind_blk + 1] = k; - ind_blk += 2; - } - } - if (iam < nprocs_symb) - begEndBlks_loc[2 * nblks_loc] = n; - - SUPERLU_FREE (avail_pes); - SUPERLU_FREE (vtcs_pe); - - Pslu_freeable->maxNvtcsPProc = maxNvtcsPProc; - Pslu_freeable->globToLoc = globToLoc; - if (iam < nprocs_symb) { - VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc; - VInfo->nblks_loc = nblks_loc; - VInfo->nvtcs_loc = nvtcs_loc; - VInfo->curblk_loc = 0; - VInfo->maxNeltsVtx = maxNeltsVtx; - VInfo->filledSep = FALSE; - VInfo->xlsub_nextLvl = 0; - VInfo->xusub_nextLvl = 0; - VInfo->begEndBlks_loc = begEndBlks_loc; - VInfo->fstVtx_nextLvl = begEndBlks_loc[0]; - } - return SUCCES_RET; -} - -static void -symbfact_distributeMatrix -( - int iam, /* Input - my processor number */ - int nprocs_num, /* Input - number of processors */ - int nprocs_symb, /* Input - number of processors for the - symbolic factorization */ - SuperMatrix *A, /* Input - input matrix A */ - int_t *perm_c, /* Input - column permutation */ - int_t *perm_r, /* Input - row permutation */ - matrix_symbfact_t *AS, /* Output - temporary storage for the - redistributed matrix */ - Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices - distribution */ - int_t *tempArray, /* Input/Output - temporary array of size n - (order of the matrix) */ - MPI_Comm *num_comm /* Input - communicator for nprocs_num procs */ - ) -{ -/* - * Purpose - * ======= - * - * Distribute input matrix A for the symbolic factorization routine. - * Only structural information is distributed. The redistributed - * matrix has its rows and columns permuted according to perm_r and - * perm_c. A is not modified during this routine. - * - */ -/* Notations: - * Ainf : inferior part of A, including diagonal. - * Asup : superior part of A. - */ - int p, p_irow, code_err, ainf_data; - int_t n, m_loc, fst_row; - int_t i, j, k, irow, jcol; - NRformat_loc *Astore; - int_t nnz_loc, nnz_iam; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be received */ - /* number of nonzeros to send/receive per processor */ - int_t *nnzToSend, *nnzToRecv; - int_t *nnzAinf_toSnd; /* nnz in Ainf to send */ - /* VInfo data structures */ - int_t *globToLoc, *begEndBlks_loc, nblks_loc, nvtcs_loc, maxNvtcsPProc; - - int_t neltsRow, vtx, vtx_lid, nelts, ind; - int_t *snd_aind, *rcv_aind; - int_t *ptr_toSnd, *buf, *ptr_toRcv; - /* matrix_symbfact_t *As data */ - int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup; - int *intBuf1, *intBuf2, *intBuf3, *intBuf4; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nnzToRecv = intCalloc_symbfact(3 * (int_t)nprocs_num); - nnzToSend = nnzToRecv + nprocs_num; - nnzAinf_toSnd = nnzToRecv + 2 * nprocs_num; - - /* --------------------------------------------------------------------- - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE - SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. - ----------------------------------------------------------------------*/ - /* tempArray stores the number of nonzeros in each column of ainf */ - for (i = 0; i < n; i++) - tempArray[i] = 0; - for (i = 0; i < m_loc; i++) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - p_irow = OWNER(globToLoc[irow]); - neltsRow = 0; - - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) { - jcol = perm_c[Astore->colind[j]]; - if (jcol <= irow) { - p = OWNER(globToLoc[jcol]); - if (tempArray[jcol] == 0) { - nnzToSend[p] += 2; - nnzAinf_toSnd[p] += 2; - } - tempArray[jcol] ++; - nnzAinf_toSnd[p] ++; - } - else { - p = p_irow; - neltsRow ++; - } - nnzToSend[p] ++; - } - if (neltsRow != 0) { - nnzToSend[p_irow] += 2; - } - } - - /* add one entry which will separate columns of Ainf from rows - of Asup */ - for (p = 0; p < nprocs_num; p++) - if (nnzToSend[p] != 0) - nnzToSend[p] ++; - - /* All-to-all communication */ - MPI_Alltoall (nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - (*num_comm)); - - nnz_loc = SendCnt = RecvCnt = 0; - for (p = 0; p < nprocs_num; p++) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - } else { - nnz_loc += nnzToRecv[p]; - nnzToSend[p] = 0; - } - } - nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0) - ABORT("Malloc fails for snd_aind[]."); - if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1))) - ABORT("Malloc fails for rcv_aind[]."); - if ( !(ptr_toSnd = intCalloc_symbfact((int_t) nprocs_num)) ) - ABORT("Malloc fails for ptr_toSnd[]."); - if ( !(ptr_toRcv = intCalloc_symbfact((int_t) nprocs_num)) ) - ABORT("Malloc fails for ptr_toRcv[]."); - - /* setup ptr_toSnd[p] to point to data in snd_aind to be send to - processor p */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - if ( p != iam ) - ptr_toSnd[p] = i; - else - ptr_toSnd[p] = j; - i += nnzToSend[p]; - j += nnzToRecv[p]; - } - - for (i = 0; i < n; i++) { - if (tempArray[i] != 0) { - /* column i of Ainf will be send to a processor */ - p = OWNER( globToLoc[i] ); - if (p == iam) { - buf = &(rcv_aind[ptr_toSnd[p]]); - } - else { - buf = &(snd_aind[ptr_toSnd[p]]); - } - buf[0] = tempArray[i]; - buf[1] = i; - tempArray[i] = ptr_toSnd[p] + 2; - ptr_toSnd[p] += 2 + buf[0]; - } - } - - /* set ptr_toSnd to point to Asup data (stored by rows) */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - if ( p != iam ) { - if (nnzToSend[p] != 0) { - snd_aind[i + nnzAinf_toSnd[p]] = EMPTY; - ptr_toSnd[p] = i + nnzAinf_toSnd[p] + 1; - } - } - else { - if (nnzToRecv[p] != 0) { - rcv_aind[j + nnzAinf_toSnd[p]] = EMPTY; - ptr_toSnd[p] = j + nnzAinf_toSnd[p] + 1; - } - } - i += nnzToSend[p]; - j += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE snd_aind STRUCTURE TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - For each processor, we store first the columns to be sent, - and then the rows to be sent. For each row/column sent: - entry 0 : x = number of elements in that row/column - entry 1 : row/column number - entries 2 .. x + 2 : row/column indices. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; i++) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*A */ - p_irow = OWNER( globToLoc[irow] ); - ptr_toSnd[p_irow] +=2; - neltsRow = 0; - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) { - jcol = perm_c[Astore->colind[j]]; - if (jcol <= irow) { - p = OWNER( globToLoc[jcol] ); - k = tempArray[jcol]; - tempArray[jcol] ++; - if (p == iam) { /* local */ - rcv_aind[k] = irow; - } - else { - snd_aind[k] = irow; - } - } - else { - p = p_irow; - neltsRow ++; - k = ptr_toSnd[p]; - ptr_toSnd[p] ++; - if (p == iam) { /* local */ - rcv_aind[k] = jcol; - } - else { - snd_aind[k] = jcol; - } - } - } - - if (neltsRow == 0) - ptr_toSnd[p_irow] -= 2; - else { - /* store entry 0 and entry 1 */ - if (p_irow == iam) { /* local */ - rcv_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow; - rcv_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow; - } - else { /* remote */ - snd_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow; - snd_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow; - } - } - } - - /* reset ptr_toSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - ptr_toSnd[p] = i; - i += nnzToSend[p]; - ptr_toRcv[p] = j; - j += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs_num > 1) { -#if defined (_LONGINT) - intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs_num * sizeof(int)); - intBuf2 = intBuf1 + nprocs_num; - intBuf3 = intBuf1 + 2 * nprocs_num; - intBuf4 = intBuf1 + 3 * nprocs_num; - - for (p=0; p INT_MAX || ptr_toSnd[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptr_toRcv[p] > INT_MAX) - ABORT("ERROR in symbfact_distributeMatrix size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptr_toSnd[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptr_toRcv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptr_toSnd; - intBuf3 = nnzToRecv; intBuf4 = ptr_toRcv; -#endif - - i = nnzToRecv[iam]; /* This corresponds to nnzToRecv[iam] */ - intBuf3[iam] = 0; - intBuf1[iam] = 0; - - MPI_Alltoallv (snd_aind, intBuf1, intBuf2, mpi_int_t, - rcv_aind, intBuf3, intBuf4, mpi_int_t, - (*num_comm)); -#if defined (_LONGINT) - SUPERLU_FREE (intBuf1); -#endif - nnzToRecv[iam] = i; - } - - /* ------------------------------------------------------------ - DEALLOCATE SEND STORAGE - ------------------------------------------------------------*/ - if (snd_aind) SUPERLU_FREE( snd_aind ); - SUPERLU_FREE( ptr_toSnd ); - - /* ------------------------------------------------------------ - CONVERT THE RECEIVED FORMAT INTO THE SYMBOLIC FORMAT. - THIS IS PERFORMED ONLY BY NPROCS_SYMB PROCESSORS - ------------------------------------------------------------*/ - if (iam < nprocs_symb) { - nblks_loc = VInfo->nblks_loc; - begEndBlks_loc = VInfo->begEndBlks_loc; - nvtcs_loc = VInfo->nvtcs_loc; - /* ------------------------------------------------------------ - Allocate space for storing indices of A after redistribution. - ------------------------------------------------------------*/ - if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1))) - ABORT("Malloc fails for x_ainf[]."); - if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1))) - ABORT("Malloc fails for x_asup[]."); - - /* Initialize the array of columns/rows pointers */ - for (i = 0, p = 0; p < nprocs_num; p++) { - ainf_data = TRUE; - k = 0; - while (k < nnzToRecv[p]) { - j = rcv_aind[i + k]; - if (j == EMPTY) { - ainf_data = FALSE; - k ++; - } - else { - nelts = rcv_aind[i + k]; - vtx = rcv_aind[i + k + 1]; - vtx_lid = LOCAL_IND( globToLoc[vtx] ); - k += nelts + 2; - if (ainf_data) - x_ainf[vtx_lid] += nelts; - else - x_asup[vtx_lid] = nelts; - } - } - i += nnzToRecv[p]; - } - - /* copy received information */ - vtx_lid = 0; - for (i = 0, k = 0, j = 0; i < nblks_loc; i++) { - for (vtx = begEndBlks_loc[2*i]; vtx < begEndBlks_loc[2*i+1]; vtx++, vtx_lid ++) { - nelts = x_ainf[vtx_lid]; - x_ainf[vtx_lid] = k; - k += nelts; - nelts = x_asup[vtx_lid]; - x_asup[vtx_lid] = j; - j += nelts; - tempArray[vtx] = x_ainf[vtx_lid]; - } - } - x_ainf[nvtcs_loc] = k; - x_asup[nvtcs_loc] = j; - - /* Allocate space for storing indices of A after conversion */ - if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) && x_ainf[nvtcs_loc] != 0 ) - ABORT("Malloc fails for ind_ainf[]."); - if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) && x_asup[nvtcs_loc] != 0) - ABORT("Malloc fails for ind_asup[]."); - - /* Copy the data into the row/column oriented storage */ - for (i = 0, p = 0; p < nprocs_num; p++) { - ainf_data = TRUE; - k = 0; - while (k < nnzToRecv[p]) { - j = rcv_aind[i + k]; - if (ainf_data && j == EMPTY) { - ainf_data = FALSE; - k ++; - } - else { - nelts = rcv_aind[i + k]; - vtx = rcv_aind[i + k + 1]; - vtx_lid = LOCAL_IND( globToLoc[vtx] ); - if (ainf_data) { - /* traverse ainf data */ - ind = tempArray[vtx]; - for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) - ind_ainf[ind] = rcv_aind[j]; - tempArray[vtx] = ind; - } - else { - /* traverse asup data */ - ind = x_asup[vtx_lid]; - for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) - ind_asup[ind] = rcv_aind[j]; - } - k += nelts + 2; - } - } - i += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - SUPERLU_FREE( ptr_toRcv ); - if (rcv_aind) SUPERLU_FREE( rcv_aind ); - if (nnzToRecv) SUPERLU_FREE( nnzToRecv ); - - AS->x_ainf = x_ainf; - AS->x_asup = x_asup; - AS->ind_ainf = ind_ainf; - AS->ind_asup = ind_asup; - - VInfo->nnz_asup_loc = x_asup[nvtcs_loc]; - VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc]; - } -} - -static -float allocPrune_lvl -( - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data - structures */ - vtcsInfo_symbfact_t *VInfo, /* Input -local info on vertices - distribution */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for data structures necessary for pruned graphs. - * For those unpredictable size, make a guess as FILL * n. - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int_t lword; - int_t nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr; - int_t nvtcs_loc, no_expand_pr, x_sz; - float alpha = 1.5; - int_t FILL = sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - - no_expand_pr = 0; - lword = (int_t) sizeof(int_t); - - /* free memory allocated for the domain symbolic factorization */ - if (Llu_symbfact->szLsubPr) - SUPERLU_FREE( Llu_symbfact->lsubPr ); - if (Llu_symbfact->szUsubPr) - SUPERLU_FREE( Llu_symbfact->usubPr ); - if (Llu_symbfact->xlsubPr) - SUPERLU_FREE( Llu_symbfact->xlsubPr ); - if (Llu_symbfact->xusubPr) - SUPERLU_FREE( Llu_symbfact->xusubPr ); - - Llu_symbfact->xlsub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1); - Llu_symbfact->xusub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1); - - /* allocate memory to use during superior levels of sep_tree */ - x_sz = SUPERLU_MIN( VInfo->maxNvtcsNds_loc, VInfo->maxSzBlk); - nzlmaxPr = 2 * FILL * VInfo->maxNvtcsNds_loc; - nzumaxPr = 2 * FILL * VInfo->maxSzBlk; - - /* Integer pointers for L\U factors */ - if (x_sz != 0) { - xlsubPr = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1); - xusubPr = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1); - - lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword); - - while ( !lsubPr || !usubPr ) { - if ( lsubPr ) SUPERLU_FREE( lsubPr ); - if ( usubPr ) SUPERLU_FREE( usubPr ); - - nzlmaxPr /= 2; nzlmaxPr = alpha * (float) nzlmaxPr; - nzumaxPr /= 2; nzumaxPr = alpha * (float) nzumaxPr; - - if ( nzumaxPr < x_sz ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsubPr = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword); - ++no_expand_pr; - } - } - else { - xlsubPr = NULL; lsubPr = NULL; - xusubPr = NULL; usubPr = NULL; - nzlmaxPr = 0; nzumaxPr = 0; - } - - if (VInfo->maxNvtcsNds_loc) - Llu_symbfact->cntelt_vtcsA_lvl = - (int_t *) SUPERLU_MALLOC (VInfo->maxNvtcsNds_loc * lword); - - if (PS->maxSzLPr < Llu_symbfact->indLsubPr) - PS->maxSzLPr = Llu_symbfact->indLsubPr; - if (PS->maxSzUPr < Llu_symbfact->indUsubPr) - PS->maxSzUPr = Llu_symbfact->indUsubPr; - - Llu_symbfact->lsubPr = lsubPr; - Llu_symbfact->xlsubPr = xlsubPr; - Llu_symbfact->usubPr = usubPr; - Llu_symbfact->xusubPr = xusubPr; - Llu_symbfact->szLsubPr = nzlmaxPr; - Llu_symbfact->szUsubPr = nzumaxPr; - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - - Llu_symbfact->no_expand_pr += no_expand_pr; - return 0; -} - -static float -allocPrune_domain -( - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data - structures */ - vtcsInfo_symbfact_t *VInfo, /* Input -local info on vertices - distribution */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for data structures necessary for pruned graphs. - * For those unpredictable size, make a guess as FILL * n. - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int_t lword; - int_t nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr; - int_t nvtcs_loc, no_expand_pr, x_sz; - float alpha = 1.5; - int_t FILL = 2 * sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - - no_expand_pr = 0; - lword = (int_t) sizeof(int_t); - - /* allocate memory to use during domain_symbolic routine */ - /* Guess for prune graph */ - x_sz = lstVtx - fstVtx; - nzlmaxPr = nzumaxPr = 2*FILL * x_sz; - - /* Integer pointers for L\U factors */ - if (x_sz != 0) { - xlsubPr = intMalloc_symbfact(x_sz+1); - xusubPr = intMalloc_symbfact(x_sz+1); - - lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword); - - while ( !lsubPr || !usubPr ) { - if ( lsubPr ) SUPERLU_FREE(lsubPr); - if ( usubPr ) SUPERLU_FREE(usubPr); - - nzlmaxPr /= 2; nzlmaxPr = alpha * (float) nzlmaxPr; - nzumaxPr /= 2; nzumaxPr = alpha * (float) nzumaxPr; - - if ( nzumaxPr < x_sz ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsubPr = (void *) SUPERLU_MALLOC(nzlmaxPr * lword); - usubPr = (void *) SUPERLU_MALLOC(nzumaxPr * lword); - ++no_expand_pr; - } - } - else { - xlsubPr = NULL; - xusubPr = NULL; - } - - Llu_symbfact->lsubPr = lsubPr; - Llu_symbfact->xlsubPr = xlsubPr; - Llu_symbfact->usubPr = usubPr; - Llu_symbfact->xusubPr = xusubPr; - Llu_symbfact->szLsubPr = nzlmaxPr; - Llu_symbfact->szUsubPr = nzumaxPr; - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - Llu_symbfact->xlsub_rcvd = NULL; - Llu_symbfact->xusub_rcvd = NULL; - Llu_symbfact->cntelt_vtcsA_lvl = NULL; - - PS->maxSzLPr = Llu_symbfact->indLsubPr; - PS->maxSzUPr = Llu_symbfact->indUsubPr; - - Llu_symbfact->no_expand_pr = no_expand_pr; - Llu_symbfact->no_expcp = 0; - return 0; -} - -/************************************************************************/ -static -int symbfact_alloc -/************************************************************************/ -( - int_t n, /* Input - order of the matrix */ - int nprocs, /* Input - number of processors for the symbolic - factorization */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices - distribution */ - comm_symbfact_t *CS, /* Input -information on communication */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for the data structures common to symbolic factorization - * routines. For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int nlvls, p; /* no of levels in the separator tree */ - int_t lword, no_expand; - int_t *xsup, *supno; - int_t *lsub, *xlsub; - int_t *usub, *xusub; - int_t nzlmax, nzumax, nnz_a_loc; - int_t nvtcs_loc, *cntelt_vtcs; - float alpha = 1.5; - int_t FILL = sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc; - nlvls = (int) LOG2( nprocs ) + 1; - no_expand = 0; - lword = sizeof(int_t); - - /* Guess for L\U factors */ - nzlmax = nzumax = FILL * nnz_a_loc; - - /* Integer pointers for L\U factors */ - supno = intMalloc_symbfact(nvtcs_loc+1); - xlsub = intMalloc_symbfact(nvtcs_loc+1); - xusub = intMalloc_symbfact(nvtcs_loc+1); - - lsub = (void *) SUPERLU_MALLOC(nzlmax * lword); - usub = (void *) SUPERLU_MALLOC(nzumax * lword); - - while ( !lsub || !usub ) { - if (!lsub) SUPERLU_FREE(lsub); - if (!usub) SUPERLU_FREE(usub); - - nzlmax /= 2; nzlmax = alpha * nzlmax; - nzumax /= 2; nzumax = alpha * nzumax; - - if ( nzumax < nnz_a_loc/2 ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsub = (void *) SUPERLU_MALLOC(nzlmax * lword); - usub = (void *) SUPERLU_MALLOC(nzumax * lword); - ++no_expand; - } - - if (nprocs == 1) - cntelt_vtcs = NULL; - else - cntelt_vtcs = intMalloc_symbfact (nvtcs_loc+1); - - /* allocate memory for communication data structures */ - CS->rcv_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1); - CS->snd_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1); - CS->ptr_rcvBuf = intMalloc_symbfact (2 * (int_t) nprocs ); - CS->rcv_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1); - CS->snd_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1); - - CS->snd_interSz = intMalloc_symbfact ((int_t) nlvls + 1); - CS->snd_LinterSz = intMalloc_symbfact ((int_t) nlvls + 1); - CS->snd_vtxinter = intMalloc_symbfact ((int_t) nlvls + 1); - CS->rcv_bufSz = 0; - CS->rcv_buf = NULL; - CS->snd_bufSz = 0; - CS->snd_buf = NULL; - - for (p = 0; p < nprocs; p++) { - CS->rcv_interLvl[p] = EMPTY; - CS->snd_interLvl[p] = EMPTY; - CS->rcv_intraLvl[p] = EMPTY; - CS->snd_intraLvl[p] = EMPTY; - } - - for (p = 0; p <= nlvls; p++) { - CS->snd_vtxinter[p] = EMPTY; - CS->snd_interSz[p] = 0; - CS->snd_LinterSz[p] = 0; - } - - Pslu_freeable->supno_loc = supno; - Llu_symbfact->lsub = lsub; - Llu_symbfact->xlsub = xlsub; - Llu_symbfact->usub = usub; - Llu_symbfact->xusub = xusub; - Llu_symbfact->szLsub = nzlmax; - Llu_symbfact->szUsub = nzumax; - Llu_symbfact->cntelt_vtcs = cntelt_vtcs; - - Llu_symbfact->no_expand = no_expand; - - return SUCCES_RET; -} /* SYMBFACT_ALLOC */ - -static int_t -symbfact_vtx -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t vtx, /* Input - vertex number to perform symbolic factorization */ - int_t vtx_lid, /* Input - local vertex number */ - int_t vtx_prid, /* Input - */ - int_t computeL, /* Input - TRUE when compute column L(:,vtx) - otheriwse compute row U(vtx, :) */ - int domain_symb, /* Input - if TRUE, computation corresponds to the independent - domain at the bottom of the separator tree */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - int_t snrep_lid, /* local index of current supernode reprezentative */ - int_t szSn, /* size of supernode with snrep_lid reprezentative */ - int_t *p_next, /* next element in sub structure */ - int_t *marker, - int_t *sub_rcvd, /* elements of node */ - int_t sub_rcvd_sz, /* size of sub to be explored */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - int_t *p_neltsVtxInit, - int_t *p_neltsVtx, - int_t *p_neltsVtx_CSep, - int_t *p_neltsZrVtx, - int_t *p_neltsMatched, - int_t mark_vtx, - int_t *p_prval_curvtx, - int_t vtx_bel_othSn, - int_t *p_vtx_bel_mySn - ) -{ - int_t x_aind_beg, x_aind_end; - int_t k, vtx_elt, ind, pr, pr_lid, mem_error, ii, jj, compRcvd; - int_t *xsub, *sub, *xsubPr, *subPr, *xsub_rcvd, *xsub_src, *sub_src; - int_t pr_elt, next, prval_curvtx, maxNvtcsPProc; - int_t neltsVtx, neltsMatched, neltsZrVtx, neltsZrSn, neltsVtx_CSep; - int_t neltsVtxInit, kk; - int diagind, upd_lstSn; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - upd_lstSn = FALSE; - diagind = FALSE; - prval_curvtx = *p_prval_curvtx; - neltsVtx_CSep = 0; - next = *p_next; - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsub_rcvd = Llu_symbfact->xlsub_rcvd; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsub_rcvd = Llu_symbfact->xusub_rcvd; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - } - - x_aind_beg = xsub[vtx_lid]; - x_aind_end = xsub[vtx_lid + 1]; - xsub[vtx_lid] = next; - k = x_aind_beg; - /* while (sub[k] != EMPTY && k < x_aind_end) { */ - while (k < x_aind_end) { - if (sub[k] == EMPTY) - k = x_aind_end; - else { - vtx_elt = sub[k]; - if (!computeL) - if (marker[vtx_elt] == mark_vtx - 2) - if (vtx_elt < prval_curvtx) - prval_curvtx = vtx_elt; - marker[vtx_elt] = mark_vtx; - if (computeL && vtx_elt == vtx) - diagind = TRUE; - if (!computeL && vtx_elt == vtx) - printf ("Pe[%d] ERROR diag elt in U part vtx %d dom_s %d fstV %d lstV %d\n", - iam, vtx, domain_symb, fstVtx, lstVtx); - else { - sub[next] = vtx_elt; - next ++; - } - if (vtx_elt < lstVtx) neltsVtx_CSep ++; - k++; - } - } - neltsVtxInit = k - x_aind_beg; - PS->nops += neltsVtxInit; - - if (domain_symb) { - if (computeL) - VInfo->nnz_ainf_loc -= x_aind_end - x_aind_beg; - else - VInfo->nnz_asup_loc -= x_aind_end - x_aind_beg; - } - -#ifdef TEST_SYMB - printf ("compL %d vtx %d vtx_lid %d vtx_prid %d vtx_bel_othSn %d\n", - computeL, vtx, vtx_lid, vtx_prid, vtx_bel_othSn); - PrintInt10 ("A(:, v)", x_aind_end - x_aind_beg, &(sub[xsub[vtx_lid]])); -#endif - - ind = xsubPr[vtx_prid]; - if (vtx_bel_othSn == vtx) - upd_lstSn = TRUE; - - while (ind != EMPTY || upd_lstSn) { - if (upd_lstSn ) { - upd_lstSn = FALSE; - pr_lid = snrep_lid; - } - else { - pr_lid = subPr[ind]; - ind = subPr[ind - 1]; - } - - if (!computeL) - marker[vtx] = mark_vtx; - if (pr_lid >= VInfo->nvtcs_loc) { - compRcvd = TRUE; - xsub_src = xsub_rcvd; sub_src = sub_rcvd; - pr_lid -= VInfo->nvtcs_loc; - k = xsub_src[pr_lid] + RCVD_IND; - } - else { - compRcvd = FALSE; - xsub_src = xsub; sub_src = sub; - k = xsub_src[pr_lid]; - } - - PS->nops += xsub_src[pr_lid+1] - xsub_src[pr_lid]; - for (; k < xsub_src[pr_lid+1]; k++) { - pr_elt = sub_src[k]; - if (pr_elt >= vtx && marker[pr_elt] != mark_vtx) { - - /* TEST available memory */ - if (next >= x_aind_end) { - if (domain_symb) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - } else if (mem_error = - psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, - computeL, LL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - - x_aind_end = xsub[vtx_lid + 1]; - if (computeL) sub = Llu_symbfact->lsub; - else sub = Llu_symbfact->usub; - if (!compRcvd) - sub_src = sub; - } - - sub[next] = pr_elt; next ++; - - if (pr_elt < lstVtx) neltsVtx_CSep ++; - if (computeL && pr_elt == vtx) - diagind = TRUE; - if (!computeL) - if (marker[pr_elt] == mark_vtx - 2) - if (pr_elt < prval_curvtx) - prval_curvtx = pr_elt; - marker[pr_elt] = mark_vtx; - } - } - } - - /* Abort if the diagonal element is zero */ - if (computeL && diagind == FALSE) { - printf("Pe[%d] At column %d, ", iam, vtx); - ABORT("ParSymbFact() encounters zero diagonal"); - } - - neltsVtx = next - xsub[vtx_lid]; - neltsZrVtx = 0; /* number of zero elements which would - be introduced in the vertex */ - neltsZrSn = 0; /* -"- in the supernode */ - neltsMatched = 0; - if (vtx != fstVtx) { - for (k = xsub[snrep_lid]; k < xsub[snrep_lid+1]; k++) { - vtx_elt = sub[k]; - if (vtx_elt >= vtx) { - if ((vtx_elt > vtx && !computeL) || - (vtx_elt >= vtx && computeL)) { - if (marker[vtx_elt] != mark_vtx) - neltsZrVtx ++; - else { - neltsMatched ++; - } - } - if (computeL && vtx_elt == vtx) - *p_vtx_bel_mySn = vtx; - if (!computeL && vtx_elt == vtx + 1) - *p_vtx_bel_mySn = vtx + 1; - } - } - } - else { - neltsMatched = neltsVtx; - if (! computeL) - for (k = xsub[vtx_lid]; k < next; k++) { - vtx_elt = sub[k]; - if (vtx_elt == vtx + 1) - *p_vtx_bel_mySn = vtx + 1; - } - } - - *p_neltsVtxInit = neltsVtxInit; - *p_neltsVtx = neltsVtx; - *p_neltsVtx_CSep = neltsVtx_CSep; - *p_neltsZrVtx = neltsZrVtx; - *p_neltsMatched = neltsMatched; - *p_next = next; - *p_prval_curvtx = prval_curvtx; - return SUCCES_RET; -} - -static int_t -updateRcvd_prGraph -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t *sub_rcvd, /* elements of node */ - int_t sub_rcvd_sz, /* Input - size of sub to be used in the update */ - int_t fstVtx_toUpd, /* Input - first vertex to update */ - int_t lstVtx_toUpd, /* Input - last vertex to update */ - int_t pr_offset, - int computeL, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - psymbfact_stat_t *PS - /* marker: first elements of marker contain the nodes that will - be used in the updates */ -) -{ - int_t i, k, nelts, prVal, vtx_elt, vtx_elt_lid, ind; - int_t vtx, vtx_lid, fstVtx_toUpd_lid, fstVtx_srcUpd_lid; - int_t *xsub, *sub, *xsub_rcvd, *xsubPr, *subPr, szsubPr, *p_indsubPr; - int_t maxNvtcsPProc, *globToLoc, mem_error; - int_t nvtcs_toUpd, fstVtx_srcUpd, vtx_lid_p; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - nvtcs_toUpd = lstVtx_toUpd - fstVtx_toUpd; - - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsub_rcvd = Llu_symbfact->xlsub_rcvd; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - p_indsubPr = &(Llu_symbfact->indLsubPr); - szsubPr = Llu_symbfact->szLsubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsub_rcvd = Llu_symbfact->xusub_rcvd; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - p_indsubPr = &(Llu_symbfact->indUsubPr); - szsubPr = Llu_symbfact->szUsubPr; - } - - /* count number of elements in transpose representation of sub_rcvd */ - /* use marker to count those elements */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = 0; - for (i = 0; i <= VInfo->maxSzBlk; i++) - xsub_rcvd[i] = 0; - - i = 0; - fstVtx_srcUpd = EMPTY; - while (i < sub_rcvd_sz) { - vtx = sub_rcvd[i + DIAG_IND]; - nelts = sub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - prVal = sub_rcvd[i]; - if (fstVtx_srcUpd == EMPTY) fstVtx_srcUpd = vtx; - xsub_rcvd[vtx - fstVtx_srcUpd] = i - RCVD_IND; - xsub_rcvd[vtx-fstVtx_srcUpd+1] = i + nelts; - for (k = i; k < i + nelts; k++) { - vtx_elt = sub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - - fstVtx_toUpd_lid; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - - vtx_lid = fstVtx_toUpd_lid - pr_offset; - ind = 0; - for (i = 0; i < nvtcs_toUpd; i++) { - if (marker[i] != 0) { - xsubPr[vtx_lid] = ind + 1; - ind += 2* marker[i]; - marker[i] = xsubPr[vtx_lid] - 1; - } - vtx_lid ++; - } - - if (ind == 0) - /* quick return if no update */ - return; - - /* test if enough memory in usubPr array */ - if (ind >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) - return (mem_error); - if (computeL) - subPr = Llu_symbfact->lsubPr; - else - subPr = Llu_symbfact->usubPr; - } - *p_indsubPr = ind; - - i = 0; - while (i < sub_rcvd_sz) { - vtx = sub_rcvd[i + DIAG_IND]; - nelts = sub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - prVal = sub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = sub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - vtx_lid_p = vtx_elt_lid - pr_offset; - vtx_elt_lid -= fstVtx_toUpd_lid; - /* add vtx to structure of pruned graph */ - if (marker[vtx_elt_lid] != xsubPr[vtx_lid_p] - 1) - subPr[marker[vtx_elt_lid] - 2] = marker[vtx_elt_lid] + 1; - subPr[marker[vtx_elt_lid] + 1] = vtx - fstVtx_srcUpd + VInfo->nvtcs_loc; - subPr[marker[vtx_elt_lid]] = EMPTY; - marker[vtx_elt_lid] += 2; - } - } - } - } - i += nelts; - } - - for (i = fstVtx_toUpd; i < nvtcs_toUpd; i++) - marker[i] = 0; -} - -static int_t -update_prGraph -( - int iam, - int_t n, /* order of the matrix */ - int_t fstVtx_blk, /* first vertex in block to factorize */ - int_t lstVtx_blk, /* last vertex in block to factorize */ - int_t snrep_lid, /* local index of current supernode reprezentative */ - int_t pr_offset, /* offset in the indexing of prune structure */ - int_t prval_cursn, /* prune value of current supernode reprezentative */ - int_t xsub_snp1, /* denotes xsub[snrep_lid + 1] */ - int computeL, /* Input - if 1, compute column L(:,vtx) - else compute row U(vtx, :) */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - psymbfact_stat_t *PS - ) -{ - int_t k, mem_error; - int_t kmin, kmax, ktemp, maxElt; - int_t sn_elt, sn_elt_prid; - int_t *globToLoc, maxNvtcsPProc; - int_t *xsub, *sub, *xsubPr, *subPr; - int_t *p_indsubPr, szsubPr; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - p_indsubPr = &(Llu_symbfact->indLsubPr); - szsubPr = Llu_symbfact->szLsubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - p_indsubPr = &(Llu_symbfact->indUsubPr); - szsubPr = Llu_symbfact->szUsubPr; - } - - kmin = xsub[snrep_lid]; - kmax = xsub_snp1 - 1; - if (prval_cursn != n) - maxElt = prval_cursn; - else - maxElt = EMPTY; - while (kmin <= kmax) { - if (prval_cursn == n) { - /* compute maximum element of L(:, vtx) */ - if (sub[kmin] > maxElt) - maxElt = sub[kmin]; - kmin ++; - } - else { - /* Do a quicksort-type partition. */ - if (sub[kmax] > prval_cursn) - kmax--; - else if (sub[kmin] <= prval_cursn) - kmin++; - else { /* kmin does'nt belong to G^s(L), and kmax belongs: - * interchange the two subscripts - */ - ktemp = sub[kmin]; - sub[kmin] = sub[kmax]; - sub[kmax] = ktemp; - kmin ++; - kmax --; - } - } - } - k = xsub[snrep_lid]; - while (sub[k] <= prval_cursn && k < xsub_snp1) { - sn_elt = sub[k]; - if (sn_elt < lstVtx_blk) { - sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset; - if ((*p_indsubPr) + 2 >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) - return (mem_error); - if (computeL) { - subPr = Llu_symbfact->lsubPr; szsubPr = Llu_symbfact->szLsubPr; - } - else { - subPr = Llu_symbfact->usubPr; szsubPr = Llu_symbfact->szUsubPr; - } - } - /* add krow to structure of pruned graph */ - subPr[(*p_indsubPr) + 1] = snrep_lid; - subPr[(*p_indsubPr)] = xsubPr[sn_elt_prid]; - xsubPr[sn_elt_prid] = (*p_indsubPr) + 1; - (*p_indsubPr) += 2; - } - if (sn_elt == maxElt) { - /* move prune val in the first position */ - sub[k] = sub[xsub[snrep_lid]]; - sub[xsub[snrep_lid]] = sn_elt; - } - k ++; - } - return SUCCES_RET; -} - -static int_t -blk_symbfact -(SuperMatrix *A, - int iam, - int lvl, - int szSep, - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx_loc, /* Input - first vertex local of the level */ - int_t fstVtx_blk, - int_t lstVtx_blk, - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc - ) -{ - int szSep_tmp, lvl_tmp, ii, jj; - int_t *xlsubPr, *xusubPr; - int_t *xsup, *supno, *lsub, *xlsub, *usub, *xusub; - int_t vtx_lid, vtx_prid, vtx, vtx_super, vtx_elt, maxNvtcsPProc; - int_t ind, pr, pr_elt, newnext, k, vtx_elt_lid; - int_t nextl, nextu, nsuper_loc, nvtcs, n, mem_error; - int_t x_aind_beg, x_aind_end, i, szLp, xlsub_snp1, xusub_snp1; - int_t snrep, snrep_lid, szsn, vtxp1, *globToLoc, domain_symb; - int_t lstVtx, neltsCurSep, maxNeltsVtx, fstVtx_loc_lid; - /* supernode relaxation parameters */ - int_t neltsVtx_L, neltsZrVtx_L, neltsMatched_L, neltsVtx_CSep_L; - int_t neltsVtx_U, neltsZrVtx_U, neltsMatched_U, neltsVtx_CSep_U; - int_t neltsZrSn_L, neltsZrSn_U, neltsZr, neltsTotal, - neltsZr_tmp, neltsTotal_tmp, neltsZrSn, neltsVtxInit_l, neltsVtxInit_u; - /* next vertex belongs to current supernode pruned structure */ - int_t vtx_bel_snL, vtx_bel_snU; - /* marker variables */ - int_t markl1_vtx, markl2_vtx, marku1_vtx, marku2_vtx; - /* prune structure variables */ - int_t prval_cursn, prval_curvtx, pr_offset; - /* variables for comms info */ - int_t neltSn_L, neltSn_U, lstVtx_tmp, stat; - float relax_param, relax_seps; - - if (fstVtx_blk >= lstVtx_blk) - return; - - /* Initializations */ - supno = Pslu_freeable->supno_loc; - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - xusubPr = Llu_symbfact->xusubPr; - xlsubPr = Llu_symbfact->xlsubPr; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - maxNeltsVtx = VInfo->maxNeltsVtx; - - n = A->ncol; - nextl = *p_nextl; - nextu = *p_nextu; - neltsZr = *p_neltsZr; - neltsTotal = *p_neltsTotal; - nsuper_loc = *p_nsuper_loc; - marku2_vtx = *p_mark; - lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - - snrep = fstVtx_blk; - snrep_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - szsn = 1; - nvtcs = lstVtx_blk - fstVtx_blk; - prval_cursn = n; - vtx_bel_snL = EMPTY; vtx_bel_snU = EMPTY; - - /* set up to EMPTY xlsubPr[], xusubPr[] */ - if (PS->maxSzLPr < Llu_symbfact->indLsubPr) - PS->maxSzLPr = Llu_symbfact->indLsubPr; - if (PS->maxSzUPr < Llu_symbfact->indUsubPr) - PS->maxSzUPr = Llu_symbfact->indUsubPr; - for (i = 0; i < nvtcs; i++) { - xlsubPr[i] = EMPTY; - xusubPr[i] = EMPTY; - } - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - - if (ind_sizes1 == 0) - domain_symb = TRUE; - else { - domain_symb = FALSE; - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - } - - vtx_prid = 0; - vtx_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - pr_offset = vtx_lid; - - if (lsub_rcvd != NULL) { - updateRcvd_prGraph (n, iam, lsub_rcvd, lsub_rcvd_sz, - fstVtx_blk, lstVtx_blk, pr_offset, 1, marker, - Pslu_freeable, Llu_symbfact, VInfo, PS); - updateRcvd_prGraph (n, iam, usub_rcvd, usub_rcvd_sz, - fstVtx_blk, lstVtx_blk, pr_offset, 0, marker, - Pslu_freeable, Llu_symbfact, VInfo, PS); - } - - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++, vtx_prid ++) { - vtxp1 = vtx + 1; - if (marku2_vtx +4 >= n) { - /* reset to EMPTY marker array */ - for (i = 0; i < n; i++) - marker[i] = EMPTY; - marku2_vtx = EMPTY; - } - markl1_vtx = marku2_vtx + 1; markl2_vtx = markl1_vtx + 1; - marku1_vtx = markl2_vtx + 1; marku2_vtx = marku1_vtx + 1; - - prval_curvtx = n; - /* Compute nonzero structure L(:,vtx) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextl, - marker, - lsub_rcvd, lsub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, - &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, - &neltsMatched_L, markl1_vtx, &prval_curvtx, - vtx_bel_snU, &vtx_bel_snL)) - return (mem_error); - lsub = Llu_symbfact->lsub; - -#ifdef TEST_SYMB - PrintInt10 ("L(:, %d)", nextl - xlsub[vtx_lid], &(lsub[xlsub[vtx_lid]])); -#endif - - /* Compute nonzero structure of U(vtx,:) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextu, - marker, - usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, - &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, - &neltsMatched_U, marku1_vtx, &prval_curvtx, - vtx_bel_snL, &vtx_bel_snU)) - return (mem_error); - usub = Llu_symbfact->usub; - -#ifdef TEST_SYMB - PrintInt10 ("U(%d, :)", nextu - xusub[vtx_lid], &(usub[xusub[vtx_lid]])); -#endif - - /* update statistics on fill-in */ - if (!domain_symb) { - stat = CEILING( (neltsVtxInit_l + neltsVtxInit_u), 2); - if (Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] != stat) { - stat = CEILING(stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid]); - PS->fill_pelt[0] += (float) stat; - if ((float) stat > PS->fill_pelt[1]) PS->fill_pelt[1] = (float) stat; - PS->fill_pelt[2] += 1.; - } - stat = CEILING( (neltsVtx_L + neltsVtx_U), 2); - stat = CEILING( stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] ); - PS->fill_pelt[3] += (float) stat; - if ((float) stat > PS->fill_pelt[4]) PS->fill_pelt[4] = (float) stat; - PS->fill_pelt[5] += 1.; - } - - /* compute number of artificial zeros */ - neltsTotal = 0; - neltsZr = 0; - neltsZrSn_L = neltsVtx_L - neltsMatched_L; - neltsZrSn_U = neltsVtx_U - neltsMatched_U; - neltsZrSn = neltsZrVtx_L + neltsZrVtx_U + - (neltsZrSn_L + neltsZrSn_U) * szsn; - neltsZr_tmp = neltsZr + neltsZrSn; - neltsTotal_tmp = neltsTotal + neltsZrSn + neltsVtx_L + neltsVtx_U; - if (neltsTotal_tmp == 0) - neltsTotal_tmp = 1; - relax_param = (float) (neltsTotal_tmp - neltsZr_tmp) / neltsTotal_tmp; - -#ifdef TEST_SYMB - printf ("[%d] vtx %d pr %d szsn %d nVtx_L %d nZrSn_L %d nZrVtx_L %d\n", - iam, vtx, prval_curvtx, szsn,neltsVtx_L, neltsZrSn_L, neltsZrVtx_L); - printf (" [%d] nVtx_U %d, nZrSn_U %d nZrVtx_U %d nextl %d nextu %d\n", - iam, neltsVtx_U, neltsZrSn_U, neltsZrVtx_U, nextl, nextu); - printf (" [%d] nZr %d nZr_tmp %d nTot %d nTot_tmp %d rel %f test %d\n\n", - iam, neltsZr, neltsZr_tmp, neltsTotal, neltsTotal_tmp, - relax_param, i); -#endif - - /* Check to see if vtx belongs in the same supernode as vtx-1 */ - supno[vtx_lid] = nsuper_loc; - if (vtx == fstVtx_blk) { - prval_cursn = prval_curvtx; - neltsTotal += neltsVtx_L + neltsVtx_U; - } - else { - if (maxNeltsVtx > 0) { - relax_seps = (float) neltsVtx_L / (float) maxNeltsVtx; - relax_seps *= (float) (neltsVtx_U+1) / (float) maxNeltsVtx; - } - else - relax_seps = 0.0; - - /* check if all upper separators are dense */ - if (relax_seps >= PS->relax_seps ) { - VInfo->filledSep = FILLED_SEPS; - *p_nextl = xlsub[vtx_lid]; - *p_nextu = xusub[vtx_lid]; - nsuper_loc += 1; - *p_nsuper_loc = nsuper_loc; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, vtx, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - /* set up neltsZr and neltsTotal */ - vtx = lstVtx_blk; - return 0; - } /* if all upper separators are dense */ - else { - if (relax_param >= PS->relax_gen) { - /* vertex belongs to the same supernode */ - if (prval_cursn > prval_curvtx || prval_cursn <= vtx) - prval_cursn = prval_curvtx; - neltsZr = neltsZr_tmp; - neltsTotal = neltsTotal_tmp; - szsn ++; - /* add artificial zeros at the structure of current supernode */ - newnext = xlsub[snrep_lid+1]; - if (neltsZrSn_L != 0) { - for (k = xlsub[snrep_lid]; k < xlsub[snrep_lid+1]; k++) { - vtx_elt = lsub[k]; - if (vtx_elt >= vtx) - marker[vtx_elt] = markl2_vtx; - } - for (k = xlsub[vtx_lid]; k < nextl; k++) { - vtx_elt = lsub[k]; - if (marker[vtx_elt] != markl2_vtx) { - /* add vtx_elt to the structure of snrep */ - lsub[newnext] = vtx_elt; newnext ++; - marker[vtx_elt] = markl2_vtx; - } - } - xlsub[snrep_lid+1] = newnext; - } - xlsub[vtx_lid] = newnext; - nextl = newnext; - neltsVtx_L += neltsZrVtx_L; - - newnext = xusub[snrep_lid+1]; - if (neltsZrSn_U != 0) { - for (k = xusub[snrep_lid]; k < xusub[snrep_lid+1]; k++) { - vtx_elt = usub[k]; - if (vtx_elt >= vtx) { - if (marker[vtx_elt] == markl2_vtx) - if (prval_cursn > vtx_elt && vtx_elt != vtx) - prval_cursn = vtx_elt; - marker[vtx_elt] = marku2_vtx; - } - } - for (k = xusub[vtx_lid]; k < nextu; k++) { - vtx_elt = usub[k]; - if (marker[vtx_elt] != marku2_vtx) { - /* add vtx_elt to the structure of snrep */ - usub[newnext] = vtx_elt; newnext ++; - if (marker[vtx_elt] == markl2_vtx) - if (prval_cursn > vtx_elt && vtx_elt != vtx) - prval_cursn = vtx_elt; - marker[vtx_elt] = marku2_vtx; - } - } - if (marker[vtxp1] == marku2_vtx) - vtx_bel_snU = vtxp1; - xusub[snrep_lid+1] = newnext; - } - xusub[vtx_lid] = newnext; - nextu = newnext; - neltsVtx_U += neltsZrVtx_U; - } /* if ( relax_param >= PS->relax_param) */ - } /* if (VInfo->filledSep != FILLED_SEPS) */ - } /* if (vtx != fstVtx_blk) */ - - if ((relax_param < PS->relax_gen || vtx == lstVtx_blk-1) - && VInfo->filledSep != FILLED_SEPS) { - /* if a new supernode starts or is the last vertex */ - /* vtx starts a new supernode. Note we only store the - * subscript set of the first column of a supernode. */ - - if (marker[vtxp1] == marku1_vtx) - vtx_bel_snU = vtxp1; - /* build the pruned structure */ - if (relax_param < PS->relax_gen - && vtx == lstVtx_blk - 1 && vtx != fstVtx_blk) - szLp = 2; - else - szLp = 1; - if (vtx == fstVtx_blk) { - xlsub_snp1 = nextl; - xusub_snp1 = nextu; - } - else { - xlsub_snp1 = xlsub[snrep_lid+1]; - xusub_snp1 = xusub[snrep_lid+1]; - } - while (szLp > 0) { - szLp --; -#ifdef TEST_SYMB - printf ("End sn %d szsn %d\n", nsuper_loc, szsn); - printf ("BLD pr vtx %d snrep %d prval %d szLp %d\n", - vtx, snrep, prval_cursn, szLp); -#endif - - update_prGraph (iam, n, fstVtx_blk, lstVtx_blk, - snrep_lid, pr_offset, prval_cursn, - xlsub_snp1, 1, - Pslu_freeable, Llu_symbfact, PS); - update_prGraph (iam, n, fstVtx_blk, lstVtx_blk, - snrep_lid, pr_offset, prval_cursn, - xusub_snp1, 0, - Pslu_freeable, Llu_symbfact, PS); - -#ifdef TEST_SYMB - printf ("Adr lsub %p usub %p lsub %p pos %d usub %p pos %d\n", - &(lsub[xlsub[snrep_lid]]), &(usub[xusub[snrep_lid]]), - lsub, xlsub[snrep_lid], usub, xusub[snrep_lid]); - PrintInt10 ("Lsn", xlsub_snp1 - xlsub[snrep_lid], - &(lsub[xlsub[snrep_lid]])); - PrintInt10 ("Usn", xusub_snp1 - xusub[snrep_lid], - &(usub[xusub[snrep_lid]])); -#endif - - if (prval_cursn >= lstVtx_blk) { - neltSn_L = xlsub_snp1 - xlsub[snrep_lid]; - neltSn_U = xusub_snp1 - xusub[snrep_lid]; - if (ind_sizes1 != 0) { - CS->snd_intraSz += neltSn_L + neltSn_U + 4; - CS->snd_LintraSz += neltSn_L + 2; - } - if (prval_cursn >= lstVtx) { - /* this supernode will be send to next layers of the tree */ - lvl_tmp = lvl; - ii = ind_sizes1; - jj = ind_sizes2; - szSep_tmp = szSep; - lstVtx_tmp = lstVtx; - while (prval_cursn >= lstVtx_tmp && szSep_tmp != 1) { - jj = ii + szSep_tmp + (jj - ii) / 2; - ii += szSep_tmp; - lvl_tmp ++; - szSep_tmp = szSep_tmp / 2; - lstVtx_tmp = fstVtxSep[jj] + sizes[jj]; - CS->snd_interSz[lvl_tmp] += neltSn_L + neltSn_U + 4; - CS->snd_LinterSz[lvl_tmp] += neltSn_L + 2; - if (CS->snd_vtxinter[lvl_tmp] == EMPTY) - CS->snd_vtxinter[lvl_tmp] = snrep; - } - } - } - snrep = vtx; - snrep_lid = vtx_lid; - prval_cursn = prval_curvtx; - szsn = 1; - xlsub_snp1 = nextl; - xusub_snp1 = nextu; - } - if (relax_param < PS->relax_gen) { - neltsTotal += neltsVtx_L + neltsVtx_U; - nsuper_loc ++; - supno[vtx_lid] = nsuper_loc; - if (marker[vtxp1] == marku1_vtx) - vtx_bel_snU = vtxp1; - else - vtx_bel_snU = EMPTY; - } - } - if (vtx == lstVtx_blk - 1) - nsuper_loc ++; - - /* check if current separator is dense */ - if (!VInfo->filledSep) { - relax_seps = (float) neltsVtx_CSep_L / (float) (lstVtx - vtx); - relax_seps *= (float) (neltsVtx_CSep_U+1) / (float) (lstVtx - vtx); - if (relax_seps >= PS->relax_curSep ) - VInfo->filledSep = FILLED_SEP; - } - maxNeltsVtx --; - } - - *p_mark = marku2_vtx + 1; - *p_nextl = nextl; - *p_nextu = nextu; - *p_neltsZr = neltsZr; - *p_neltsTotal = neltsTotal; - *p_nsuper_loc = nsuper_loc; - - return 0; -} - -static void -domain_symbfact -(SuperMatrix *A, - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator (node) */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc - ) -{ - int_t lstVtx_lid, maxNvtcsPProc; - - /* call blk_symbfact */ - blk_symbfact (A, iam, lvl, - szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - EMPTY, fstVtx, lstVtx, - NULL, EMPTY, NULL, EMPTY, - Pslu_freeable, Llu_symbfact, VInfo, CS, PS, - marker, p_mark, - p_nextl, p_nextu, p_neltsZr, p_neltsTotal, - p_nsuper_loc); - - if (VInfo->filledSep != FILLED_SEPS) { - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - if (fstVtx >= lstVtx) - lstVtx_lid = 0; - else - lstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[lstVtx-1] ) + 1; - VInfo->xlsub_nextLvl = Llu_symbfact->xlsub[lstVtx_lid]; - Llu_symbfact->xlsub[lstVtx_lid] = *p_nextl; - VInfo->xusub_nextLvl = Llu_symbfact->xusub[lstVtx_lid]; - Llu_symbfact->xusub[lstVtx_lid] = *p_nextu; - } - VInfo->maxNeltsVtx -= lstVtx - fstVtx; -} - -/* - * Compute counts of rows/columns of current separator. - * cntelt_vtcs[i] is 0 when i is nonzero before current separator - * and n when i is zero before current separator. - * - * Set up nvtcsLvl_loc. - */ -static void -initLvl_symbfact -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - MPI_Comm ndComm, - int_t *marker, - int_t nextl, - int_t nextu - ) -{ - int_t *cntelt_vtcs, x_aind_beg, x_aind_end, x_aind_beg_l, x_aind_beg_u, - nelts_asup, nelts_ainf; - int_t nvtcsLvl_loc, fstVtx_loc, fstVtx_loc_lid, fstVtx_nextLvl; - int_t curblk_loc, nblks_loc, ind_blk; - int_t *lsub, *xlsub, *usub, *xusub; - int_t *begEndBlks_loc, code_err, mem_error; - int_t i, j, k, vtx, vtx_lid, fstVtx_blk, lstVtx_blk, vtx_elt, p, fill; - int_t nelts, nelts_fill_l, nelts_fill_u, nelts_cnts, maxNvtcsPProc, *globToLoc; - int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u; - MPI_Status status; - - fill = PS->fill_par; - VInfo->filledSep = FALSE; - - /* Initializations */ - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - curblk_loc = VInfo->curblk_loc; - nblks_loc = VInfo->nblks_loc; - begEndBlks_loc = VInfo->begEndBlks_loc; - cntelt_vtcs = Llu_symbfact->cntelt_vtcs; - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* compute nvtcsLvl_loc */ - nvtcsLvl_loc = 0; - ind_blk = curblk_loc; - while (fstVtx > begEndBlks_loc[ind_blk] && ind_blk < 2 * nblks_loc) { - ind_blk += 2; - } - curblk_loc = ind_blk; - fstVtx_loc = begEndBlks_loc[ind_blk]; - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - nvtcsLvl_loc += begEndBlks_loc[ind_blk + 1] - - begEndBlks_loc[ind_blk]; - ind_blk += 2; - } - fstVtx_nextLvl = begEndBlks_loc[ind_blk]; - VInfo->nvtcsLvl_loc = nvtcsLvl_loc; - VInfo->curblk_loc = curblk_loc; - - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - vtx_lid = fstVtx_loc_lid; - x_aind_beg_l = VInfo->xlsub_nextLvl; - x_aind_beg_u = VInfo->xusub_nextLvl; - nelts_cnts = 0; - nelts_fill_l = 0; - nelts_fill_u = 0; - ind_blk = curblk_loc; - - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - fstVtx_blk = begEndBlks_loc[ind_blk]; - lstVtx_blk = begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) - nelts_cnts += cntelt_vtcs[vtx_lid]; - nelts_fill_l += fill * (xlsub[vtx_lid] - x_aind_beg_l); - nelts_fill_u += fill * (xusub[vtx_lid] - x_aind_beg_u); - x_aind_beg_l = xlsub[vtx_lid]; - x_aind_beg_u = xusub[vtx_lid]; - } - - if (nvtcsLvl_loc != 0) { - nelts_ainf = xlsub[vtx_lid] - VInfo->xlsub_nextLvl; - nelts_asup = xusub[vtx_lid] - VInfo->xusub_nextLvl; - } - else { - nelts_ainf = 0; - nelts_asup = 0; - } - - use_fillcnts = FALSE; - if (nextl + nelts_cnts >= Llu_symbfact->szLsub - nelts_ainf || - nextu + nelts_cnts >= Llu_symbfact->szUsub - nelts_asup) { - use_fillcnts = TRUE; - } - - use_fillcnts = TRUE; - - if (use_fillcnts) { - if (nextl + nelts_fill_l >= Llu_symbfact->szLsub - nelts_ainf) - mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx, nextl, - nextl + nelts_fill_l, LSUB, - RL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - if (nextu + nelts_fill_u >= Llu_symbfact->szUsub - nelts_asup) - mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx, nextu, - nextu + nelts_fill_u, USUB, - RL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - /* init xlsub[fstVtx:lstVtx] and xusub[fstVtx:lstVtx] and - copy elements of A[fstVtx:lstVtx, fstVtx:lstVtx] in lsub and usub */ - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - x_aind_beg_l = VInfo->xlsub_nextLvl; - x_aind_beg_u = VInfo->xusub_nextLvl; - vtx_lid = fstVtx_loc_lid; - ind_blk = curblk_loc; - - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - fstVtx_blk = begEndBlks_loc[ind_blk]; - lstVtx_blk = begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) { - if (vtx_lid != fstVtx_loc_lid) { - x_aind_beg_l = xlsub[vtx_lid]; - x_aind_beg_u = xusub[vtx_lid]; - } - if (use_fillcnts) { - cntelt_vtx_l = fill * (xlsub[vtx_lid+1] - x_aind_beg_l); - cntelt_vtx_u = fill * (xusub[vtx_lid+1] - x_aind_beg_u); - } - else { - cntelt_vtx_l = cntelt_vtcs[vtx_lid]; - cntelt_vtx_u = cntelt_vtcs[vtx_lid]; - } - x_aind_end = xlsub[vtx_lid + 1]; - Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] = - CEILING( (xlsub[vtx_lid+1]-x_aind_beg_l + xusub[vtx_lid+1]-x_aind_beg_u), 2); - - xlsub[vtx_lid] = nextl; - nelts = 0; - for (k = x_aind_beg_l; k < x_aind_end; k++) { - lsub[nextl] = lsub[k]; nextl ++; - nelts ++; - } - if (nelts < cntelt_vtx_l) - lsub[nextl] = EMPTY; - nextl += cntelt_vtx_l - nelts; - x_aind_end = xusub[vtx_lid + 1]; - xusub[vtx_lid] = nextu; - nelts = 0; - for (k = x_aind_beg_u; k < x_aind_end; k++) { - usub[nextu] = usub[k]; nextu ++; - nelts ++; - } - if (nelts < cntelt_vtx_u) - usub[nextu] = EMPTY; - nextu += cntelt_vtx_u - nelts; - } - } - - if (nvtcsLvl_loc == 0) { - if (curblk_loc == 0) - vtx_lid = 0; - else { - if (begEndBlks_loc[curblk_loc-1] == 0) - vtx_lid = 0; - else - vtx_lid = LOCAL_IND( globToLoc[begEndBlks_loc[curblk_loc-1] - 1] ) + 1; - } - - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - } - else { - VInfo->xlsub_nextLvl = xlsub[vtx_lid]; - xlsub[vtx_lid] = nextl; - VInfo->xusub_nextLvl = xusub[vtx_lid]; - xusub[vtx_lid] = nextu; - if (PS->estimLSz < nextl) - PS->estimLSz = nextl; - if (PS->estimUSz < nextu) - PS->estimUSz = nextu; - - VInfo->nnz_ainf_loc -= nelts_ainf; - VInfo->nnz_asup_loc -= nelts_asup; - } - VInfo->fstVtx_nextLvl = fstVtx_nextLvl; -} - - -static int_t -expand_RL -( - int_t computeRcvd, /* if = 1, then update from receive buffer, - else update from own data */ - int_t n, - int iam, /* process number */ - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - int_t vtxXp, - int_t vtx_upd_pr, /* ind in pruned structure of upd vertex which - doesn't fit into the alloc memory */ - int_t lstVtx_upd_pr, /* ind in pruned structure of lst vtx to update */ - int_t fstVtx_srcUpd, /* first vertex source of the updates */ - int_t lstVtx_srcUpd, /* last vertex source of the updates */ - int_t fstVtx_toUpd, /* first vertex to update */ - int_t lstVtx_toUpd, /* last vertex to update */ - int_t nvtcs_toUpd, /* no of vertices to update */ - int computeL, - int_t *pmarkl, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS - ) -{ - int_t fstVtx_toUpd_lid, vtx_lid, vtx, vtx_elt, vtx_elt_lid, nextl, nelts_in; - int_t i, ii, j, nelts, nelts_vtx, mpnelts, lvtx_lid, elt, vtxXp_lid; - int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub; - int_t markl, *globToLoc, maxNvtcsPProc; - int_t mem_error, len_texp; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - - xusubPr = Llu_symbfact->xlsubPr; usubPr = Llu_symbfact->lsubPr; - if (computeL) { - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - } - else { - xlsub = Llu_symbfact->xusub; lsub = Llu_symbfact->usub; - xusub = Llu_symbfact->xlsub; usub = Llu_symbfact->lsub; - } - markl = *pmarkl + 1; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] ); - nextl = xlsub[vtxXp_lid+1]; - - lvtx_lid = EMPTY; - if (lstVtx_srcUpd != EMPTY) - lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] ); - - /* count the number of new elements, and update Llu_symbfact->cntelt_vtcs */ - vtx_lid = fstVtx_toUpd_lid; - vtx_lid += vtx_upd_pr; - len_texp = 0; - for (i = vtx_upd_pr; i < lstVtx_upd_pr; i++, vtx_lid ++) { - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - if (xusubPr[i] != xusubPr[i+1]) { - j = xusubPr[i]; - vtx = usubPr[j]; - /* setup marker structure for already existing elements */ - ii = xlsub[vtx_lid]; - while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) { - marker[lsub[ii]] = markl; - ii ++; - } - nelts_vtx = ii - xlsub[vtx_lid]; - for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) { - vtx_elt = usubPr[j]; - ii = marker[vtx_elt]; - if (computeRcvd) { - nelts = lsub_rcvd[ii + NELTS_IND]; - ii += RCVD_IND; - mpnelts = marker[vtx_elt] + nelts + RCVD_IND; - } - else { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - if (vtx_elt_lid == lvtx_lid) - nelts = lsub_rcvd_sz - ii; - else - nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid]; - mpnelts = marker[vtx_elt] + nelts; - } - - if (!computeL) - marker[vtx] = markl; - for (ii; ii < mpnelts; ii++) { - elt = lsub_rcvd[ii]; - if (elt >= vtx) { - if (marker[elt] != markl) { - /* add elt to structure of vtx */ - marker[elt] = markl; - nelts_vtx ++; - } - } - } - } - if (nelts_vtx != 0 && (nelts_vtx > xlsub[vtx_lid+1] - xlsub[vtx_lid])) { - nelts_in = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - if (nelts_in == 0) nelts_in = 1; - j = nelts_vtx / nelts_in; - if (nelts_vtx % nelts_in != 0) j++; - nelts_vtx = j * nelts_in; - } - else - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - markl ++; - if (markl == n) { - /* reset marker array */ - for (j = fstVtx_toUpd; j < n; j++) - marker[j] = EMPTY; - markl = 0; - } - } - Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx; - len_texp += nelts_vtx; - } - for (; i < nvtcs_toUpd; i++, vtx_lid++) { - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx; - len_texp += nelts_vtx; - } - - *pmarkl = markl; - /* mark elements array */ - for (i = xlsub[vtxXp_lid]; i < nextl; i++) { - marker[lsub[i]] = markl; - } - - nextl = xlsub[vtxXp_lid+1]; - if (mem_error = - psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, - computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - - return 0; -} - - -static int_t -rl_update -( - int computeRcvd, /* if = 1, then update from receive buffer, - else update from own data */ - int_t n, - int iam, /* process number */ - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - int_t fstVtx_srcUpd, /* first vertex source of the updates */ - int_t lstVtx_srcUpd, /* last vertex source of the updates */ - int_t indBlk_srcUpd, /* block index of first vertex */ - int_t fstVtx_toUpd, /* first vertex to update */ - int_t lstVtx_toUpd, /* last vertex to update */ - int_t nvtcs_toUpd, /* no of vertices to update */ - int computeL, - int_t *pmarkl, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS - /* marker: first elements of marker contain the nodes that will - be used in the updates */ - ) -{ - int_t i, j, k, prVal, nelts, ind, nextl, ii, mpnelts, mem_error; - int_t vtx, vtx_lid, vtx_elt, vtx_elt_lid, lvtx_lid; - int_t fstVtx_toUpd_lid, markl, elt, vtx_loc, ind_blk; - int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub; - int_t fstVtx_upd, lstVtx_upd, maxNvtcsPProc, *globToLoc; - int_t fstVtx_srcUpd_lid, nelts_vtx, expand; - - /* quick return */ - if (fstVtx_toUpd >= lstVtx_toUpd) - return; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - - fstVtx_upd = EMPTY; - lstVtx_upd = EMPTY; - xusubPr = Llu_symbfact->xlsubPr; usubPr = Llu_symbfact->lsubPr; - if (computeL) { - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - } - else { - xlsub = Llu_symbfact->xusub; lsub = Llu_symbfact->usub; - xusub = Llu_symbfact->xlsub; usub = Llu_symbfact->lsub; - } - markl = *pmarkl; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - - /* count number of elements in transpose representation of usub_rcvd */ - /* use marker to count those elements */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = 0; - - i = 0; - if (fstVtx_srcUpd != EMPTY) { - fstVtx_srcUpd_lid = LOCAL_IND( globToLoc[fstVtx_srcUpd] ); - vtx_lid = fstVtx_srcUpd_lid; - } - lvtx_lid = EMPTY; - if (lstVtx_srcUpd != EMPTY) - lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] ); - - while (i < usub_rcvd_sz) { - if (computeRcvd) { - vtx = usub_rcvd[i + DIAG_IND]; - nelts = usub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - } - else { - if (vtx_lid == lvtx_lid) - nelts = usub_rcvd_sz - i; - else - nelts = xusub[vtx_lid + 1] - xusub[vtx_lid]; - vtx_lid ++; - } - prVal = usub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = usub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - - fstVtx_toUpd_lid; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - - ind = 0; - for (i = 0; i < nvtcs_toUpd; i++) { - if (marker[i] != 0) { - marker[i] ++; - if (fstVtx_upd == EMPTY) - fstVtx_upd = i; - lstVtx_upd = i; - } - xusubPr[i] = ind; - ind += marker[i]; - marker[i] = xusubPr[i]; - } - xusubPr[i] = ind; - lstVtx_upd ++; - - if (ind == 0) - /* quick return if no update */ - return; - - /* test if enough memory in usubPr array */ - if (ind > Llu_symbfact->szLsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) - return (mem_error); - usubPr = Llu_symbfact->lsubPr; - } - - i = 0; - if (fstVtx_srcUpd != EMPTY) { - vtx_loc = fstVtx_srcUpd; - vtx_lid = LOCAL_IND( globToLoc[vtx_loc] ); - ind_blk = indBlk_srcUpd; - } - while (i < usub_rcvd_sz) { - if (computeRcvd) { - vtx = usub_rcvd[i + DIAG_IND]; - nelts = usub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - } - else { - vtx = vtx_loc; - if (vtx_lid == lvtx_lid) - nelts = usub_rcvd_sz - i; - else - nelts = xusub[vtx_lid + 1] - xusub[vtx_lid]; - vtx_lid ++; - vtx_loc ++; - if (ind_blk != EMPTY) - if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) { - ind_blk += 2; - vtx_loc = VInfo->begEndBlks_loc[ind_blk]; - } - } - - prVal = usub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = usub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt]) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_toUpd_lid; - /* add vtx_elt to the pruned structure */ - if (marker[vtx_elt_lid] == xusubPr[vtx_elt_lid]) { - usubPr[marker[vtx_elt_lid]] = vtx_elt; - marker[vtx_elt_lid] ++; - } - usubPr[marker[vtx_elt_lid]] = vtx; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - /* reset marker array */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = EMPTY; - if (fstVtx_srcUpd != EMPTY) { - vtx_loc = fstVtx_srcUpd; - vtx_lid = LOCAL_IND( globToLoc[vtx_loc] ); - ind_blk = indBlk_srcUpd; - } - i = 0; - while (i < lsub_rcvd_sz) { - if (computeRcvd) { - vtx = lsub_rcvd[i + DIAG_IND]; - nelts = lsub_rcvd[i + NELTS_IND]; - marker[vtx] = i; - i += RCVD_IND; - } - else { - vtx = vtx_loc; - if (vtx_lid == lvtx_lid) - nelts = lsub_rcvd_sz - i; - else - nelts = xlsub[vtx_lid + 1] - xlsub[vtx_lid]; - vtx_lid ++; - marker[vtx] = i; - vtx_loc ++; - if (ind_blk != EMPTY) - if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) { - ind_blk += 2; - vtx_loc = VInfo->begEndBlks_loc[ind_blk]; - } - } - i += nelts; - } - - /* use the pruned structure to update symbolic factorization */ - vtx_lid = fstVtx_toUpd_lid; - vtx_lid += fstVtx_upd; - for (i = fstVtx_upd; i < lstVtx_upd; i++, vtx_lid ++) { - if (xusubPr[i] != xusubPr[i+1]) { - j = xusubPr[i]; - vtx = usubPr[j]; - /* setup marker structure for already existing elements */ - ii = xlsub[vtx_lid]; - while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) { - marker[lsub[ii]] = markl; - ii ++; - } - PS->nops += ii - xlsub[vtx_lid]; - nextl = ii; - for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) { - vtx_elt = usubPr[j]; - ii = marker[vtx_elt]; - if (computeRcvd) { - nelts = lsub_rcvd[ii + NELTS_IND]; - ii += RCVD_IND; - mpnelts = marker[vtx_elt] + nelts + RCVD_IND; - } - else { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - if (vtx_elt_lid == lvtx_lid) - nelts = lsub_rcvd_sz - ii; - else - nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid]; - mpnelts = marker[vtx_elt] + nelts; - } - - if (!computeL) - marker[vtx] = markl; - PS->nops += mpnelts - ii; - for (ii; ii < mpnelts; ii++) { - elt = lsub_rcvd[ii]; - if (elt >= vtx) { - if (marker[elt] != markl) { - /* add elt to structure of vtx */ - if (nextl >= xlsub[vtx_lid + 1]) { - if (mem_error = - expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, vtx, i, - lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, - fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, - &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - if (computeL) { - lsub = Llu_symbfact->lsub; - if (!computeRcvd) - lsub_rcvd = - &(Llu_symbfact->lsub[Llu_symbfact->xlsub[fstVtx_srcUpd_lid]]); - } else { - marker[vtx] = markl; - lsub = Llu_symbfact->usub; - if (!computeRcvd) - lsub_rcvd = - &(Llu_symbfact->usub[Llu_symbfact->xusub[fstVtx_srcUpd_lid]]); - } - } - lsub[nextl] = elt; nextl ++; - marker[elt] = markl; - } - } - } - } - if (nextl < xlsub[vtx_lid+1]) - lsub[nextl] = EMPTY; - markl ++; - if (markl == n) { - /* reset marker array */ - for (j = fstVtx_toUpd; j < n; j++) - marker[j] = EMPTY; - markl = 0; - } - } - } - *pmarkl = markl; - - return 0; -} - -static int_t -dnsUpSeps_symbfact -( - int_t n, - int iam, /* my processor number */ - int szSep, - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx_dns, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_nsuper_loc - ) -{ - int_t nextl, nextu, nsuper_loc, curblk_loc, mem_error; - int_t vtx_elt, ind_blk, vtx, k; - int_t *xlsub, *xusub, *lsub, *usub; - int_t fstVtx_blk, fstVtx_blk_lid, vtx_lid, lstVtx_blk, fstVtx_lvl, lstVtx_lvl; - int_t *globToLoc, maxNvtcsPProc; - - /* Initialization */ - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nextl = *p_nextl; - nextu = *p_nextu; - nsuper_loc = *p_nsuper_loc; - curblk_loc = VInfo->curblk_loc; - VInfo->nnz_ainf_loc = 0; - VInfo->nnz_asup_loc = 0; - - if (fstVtx_dns == EMPTY) - fstVtx_blk = VInfo->begEndBlks_loc[curblk_loc]; - else - fstVtx_blk = fstVtx_dns; - if (fstVtx_blk == n) - return 0; - fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - vtx_lid = fstVtx_blk_lid; - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - PS->nDnsUpSeps = 0; - - while (szSep >= 1) { - PS->nDnsUpSeps++; - fstVtx_lvl = fstVtxSep[ind_sizes2]; - lstVtx_lvl = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - if (fstVtx_blk > fstVtx_lvl) - vtx_elt = fstVtx_blk; - else - vtx_elt = fstVtx_lvl; - if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, - nextl + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - lsub = Llu_symbfact->lsub; - } - if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, - nextu + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - usub = Llu_symbfact->usub; - } - PS->nops += 2 * (lstVtx_lvl - vtx_elt); - for (; vtx_elt < lstVtx_lvl; vtx_elt++) { - lsub[nextl] = vtx_elt; nextl++; - usub[nextu] = vtx_elt; nextu++; - } - ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2; - ind_sizes1 += szSep; - szSep = szSep / 2; - } - /* delete the diagonal element from the U structure */ - usub[xusub[fstVtx_blk_lid]] = usub[nextu - 1]; - nextu --; - xlsub[fstVtx_blk_lid+1] = nextl; - xusub[fstVtx_blk_lid+1] = nextu; - - vtx_lid = fstVtx_blk_lid; - ind_blk = curblk_loc; - while (ind_blk < 2 * VInfo->nblks_loc) { - if (ind_blk != curblk_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - - for (k = xlsub[fstVtx_blk_lid]; k < xlsub[fstVtx_blk_lid+1]; k++) - if (lsub[k] >= fstVtx_blk) { - lsub[nextl] = lsub[k]; nextl ++; - if (nextl >= MEM_LSUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - lsub = Llu_symbfact->lsub; - } - for (k = xusub[fstVtx_blk_lid]; k < xusub[fstVtx_blk_lid+1]; k++) - if (usub[k] > fstVtx_blk) { - usub[nextu] = usub[k]; nextu ++; - if (nextu >= MEM_USUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, - USUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - usub = Llu_symbfact->usub; - } - PS->nops += xlsub[fstVtx_blk_lid+1] - xlsub[fstVtx_blk_lid]; - PS->nops += xusub[fstVtx_blk_lid+1] - xusub[fstVtx_blk_lid]; - } - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - Pslu_freeable->supno_loc[vtx_lid] = nsuper_loc; - if (vtx > fstVtx_blk) { - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - } - } - ind_blk += 2; - nsuper_loc ++; - } - - *p_nextl = nextl; - *p_nextu = nextu; - *p_nsuper_loc = nsuper_loc; -/* VInfo->curblk_loc = ind_blk; */ - - return 0; -} - -static int_t -dnsCurSep_symbfact -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int szSep, - int npNode, - int_t rcvd_dnsSep, - int_t *p_nextl, - int_t *p_nextu, - int_t *p_mark, - int_t *p_nsuper_loc, - int_t *marker, /* temporary array of size n */ - MPI_Comm ndCom, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS - ) -{ - int_t fstVtx_blk, fstVtx_dns, fstVtx_dns_lid, lstVtx_blk, - fstVtx, lstVtx, lstVtx_dns_lid; - int_t ind_blk, i, vtx, vtx_lid, vtx_lid_x, nvtcs_upd, save_cnt, mem_error; - int_t computeL, computeU, vtx_elt, j, cur_blk, snlid, snrep; - int_t *sub, *xsub, *minElt_vtx, *cntelt_vtcs; - int_t mark, next, *x_newelts, *x_newelts_L, *x_newelts_U; - int_t *newelts_L, *newelts_U, *newelts; - int_t *globToLoc, maxNvtcsPProc, lvl; - int_t prval, kmin, kmax, maxElt, ktemp, prpos; - float mem_dnsCS; - - if (!rcvd_dnsSep) - VInfo->curblk_loc += 2; - - computeL = TRUE; computeU = TRUE; - lstVtx_dns_lid = EMPTY; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - cur_blk = VInfo->curblk_loc; - fstVtx_dns = VInfo->begEndBlks_loc[cur_blk]; - fstVtx_dns_lid = LOCAL_IND( globToLoc[fstVtx_dns] ); - lvl = (int_t) LOG2( npNode ); - x_newelts_U = NULL; - newelts_L = NULL; - newelts_U = NULL; - mem_dnsCS = 0.; - - PS->nDnsCurSep ++; - - if (CS->rcv_bufSz > n - fstVtx_dns) - minElt_vtx = CS->rcv_buf; - else { - if (!(minElt_vtx = intMalloc_symbfact(n - fstVtx_dns))) - ABORT("Malloc fails for minElt_vtx[]."); - mem_dnsCS += n - fstVtx_dns; - } - - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - x_newelts = Llu_symbfact->cntelt_vtcs; - x_newelts_L = x_newelts; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - } - - /* use minElt_vtx to determine starting vertex of each nonzero element */ - for (i = 0; i < n - fstVtx_dns; i++) - minElt_vtx[i] = n; - - ind_blk = cur_blk; - vtx_lid = fstVtx_dns_lid; - nvtcs_upd = 0; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - nvtcs_upd += lstVtx_blk - fstVtx_blk; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - j = xsub[vtx_lid]; - while (j < xsub[vtx_lid+1] && sub[j] != EMPTY) { - PS->nops ++; - vtx_elt = sub[j] - fstVtx_dns; - if (minElt_vtx[vtx_elt] == n) { - minElt_vtx[vtx_elt] = vtx; - } - j ++; - } - } - } - if (!computeL) { - if (!(x_newelts_U = intMalloc_symbfact(nvtcs_upd + 1))) - ABORT("Malloc fails for x_newelts_U[]."); - mem_dnsCS += nvtcs_upd + 1; - x_newelts = x_newelts_U; - } - else { - /* save the value in cntelt_vtcs[lstVtx_blk_lid] */ - save_cnt = x_newelts[vtx_lid]; - lstVtx_dns_lid = vtx_lid; - } - - MPI_Allreduce (&(minElt_vtx[lstVtx - fstVtx_dns]), &(marker[lstVtx]), - n - lstVtx, mpi_int_t, MPI_MIN, ndCom); - -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) LOG2( npNode )); - PS->sz_msgsCol += (float) (n - lstVtx); - if (PS->maxsz_msgCol < n - lstVtx) - PS->maxsz_msgCol = n - lstVtx; -#endif - - /* use x_newelts to determine counts of elements starting in each vertex */ - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) - x_newelts[vtx_lid] = 0; - - for (vtx = lstVtx; vtx < n; vtx++) { - if (marker[vtx] != n) { - vtx_elt = marker[vtx]; - if (OWNER( globToLoc[vtx_elt] ) == iam) { - x_newelts[ LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid ] ++; - } - else { - /* find the first vertex > vtx_elt which belongs to iam */ - ind_blk = cur_blk; - vtx_lid = 0; - while (vtx_elt > VInfo->begEndBlks_loc[ind_blk] && - ind_blk < 2 * VInfo->nblks_loc) { - vtx_lid += VInfo->begEndBlks_loc[ind_blk+1] - - VInfo->begEndBlks_loc[ind_blk]; - ind_blk += 2; - } - if (VInfo->begEndBlks_loc[ind_blk] < lstVtx) { - x_newelts[vtx_lid] ++; - marker[vtx] = VInfo->begEndBlks_loc[ind_blk]; - } - else - marker[vtx] = n; - } - } - } - - /* set up beginning of new elements for each local vtx */ - i = 0; - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) { - j = x_newelts[vtx_lid]; - x_newelts[vtx_lid] = i; - i += j; - } - x_newelts[vtx_lid] = i; - newelts = NULL; - if (i != 0) { - if (!(newelts = intMalloc_symbfact(x_newelts[vtx_lid]))) - ABORT("Malloc fails for newelts[]."); - mem_dnsCS += x_newelts[vtx_lid]; - - for (vtx = lstVtx; vtx < n; vtx++) { - if (marker[vtx] != n) { - vtx_elt = marker[vtx]; - vtx_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid; - newelts[x_newelts[vtx_lid]] = vtx; - x_newelts[vtx_lid] ++; - } - } - } - /* reset beginning of new elements for each local vertex */ - i = 0; - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) { - j = x_newelts[vtx_lid]; - x_newelts[vtx_lid] = i; - i = j; - } - - if (computeL == TRUE) { - computeL = FALSE; - newelts_L = newelts; - } - else { - computeU = FALSE; - newelts_U = newelts; - } - } - - for (i = fstVtx_dns; i < n; i++) - marker[i] = EMPTY; - mark = 0; - - /* update vertices */ - prval = n; - ind_blk = cur_blk; - fstVtx_dns = VInfo->begEndBlks_loc[ind_blk]; - vtx_lid = LOCAL_IND( globToLoc[fstVtx_dns] ); - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - vtx_lid_x = vtx_lid - fstVtx_dns_lid; - Llu_symbfact->xlsub[vtx_lid] = *p_nextl; - Llu_symbfact->xusub[vtx_lid] = *p_nextu; - if (vtx == fstVtx_blk || x_newelts_L[vtx_lid_x+1] != x_newelts_L[vtx_lid_x] || - x_newelts_U[vtx_lid_x+1] != x_newelts_U[vtx_lid_x]) { - /* a new supernode starts */ - snlid = vtx_lid; - snrep = vtx; - if (mark + 2 > n) { - /* reset to EMPTY marker array */ - for (i = 0; i < n; i++) - marker[i] = EMPTY; - mark = 0; - } - - computeL = TRUE; - computeU = FALSE; - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - x_newelts = x_newelts_L; newelts = newelts_L; - next = *p_nextl; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - x_newelts = x_newelts_U; newelts = newelts_U; - next = *p_nextu; - } - xsub[vtx_lid] = next; - - /* TEST available memory */ - j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; - if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || - (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - if (computeL) sub = Llu_symbfact->lsub; - else sub = Llu_symbfact->usub; - } - - if (computeL) i = vtx; - else i = vtx+1; - while (i < lstVtx) { - sub[next] = i; next ++; - i ++; - } - PS->nops += x_newelts[vtx_lid_x+1]; - for (i = 0; i < x_newelts[vtx_lid_x+1]; i++) { - vtx_elt = newelts[i]; - sub[next] = vtx_elt; next ++; - if (computeU && vtx_elt < prval - && marker[vtx_elt] == mark-1) - prval = vtx_elt; - marker[vtx_elt] = mark; - } - if (computeL) { - computeL = FALSE; computeU = TRUE; - *p_nextl = next; - } - else { - computeU = FALSE; - *p_nextu = next; - } - mark ++; - } - if (vtx != fstVtx_blk) - (*p_nsuper_loc) ++; - } /* a new supernode starts */ - /* vtx belongs to the curent supernode */ - Pslu_freeable->supno_loc[vtx_lid] = *p_nsuper_loc; - } - (*p_nsuper_loc) ++; - } - - if (ind_blk > 0) { - /* if iam owns blocks of this level */ - i = *p_nextl - Llu_symbfact->xlsub[snlid]; - j = *p_nextu - Llu_symbfact->xusub[snlid]; - - if (VInfo->begEndBlks_loc[ind_blk - 1] == lstVtx && i > 1 && j > 0) { - /* if iam the last processor owning a block of this level */ - computeL = TRUE; computeU = FALSE; - /* prune the structure */ - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - next = *p_nextl; - computeL = FALSE; computeU = TRUE; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - next = *p_nextu; - computeU = FALSE; - } - - kmin = xsub[snlid]; - kmax = next - 1; - if (prval != n) { - maxElt = prval; - while (kmin <= kmax) { - /* Do a quicksort-type partition. */ - if (sub[kmax] > prval) - kmax--; - else if (sub[kmin] <= prval) { - kmin++; - } - else { /* kmin does'nt belong to G^s(L), and kmax belongs: - * interchange the two subscripts - */ - ktemp = sub[kmin]; - sub[kmin] = sub[kmax]; - sub[kmax] = ktemp; - kmin ++; - kmax --; - } - if (sub[kmin-1] == prval) - prpos = kmin - 1; - } - } - else { - maxElt = EMPTY; - while (kmin <= kmax) { - /* compute maximum element of L(:, vtx) */ - if (sub[kmin] > maxElt) { - maxElt = sub[kmin]; - prpos = kmin; - } - kmin ++; - } - } - ktemp = sub[xsub[snlid]]; - sub[xsub[snlid]] = maxElt; - sub[prpos] = ktemp; - } - - /* setup snd_interSz information */ - prval = Llu_symbfact->lsub[Llu_symbfact->xlsub[snlid]]; - if (prval >= lstVtx) { - /* this supernode will be send to next layers of the tree */ - while (prval >= lstVtx && szSep != 1) { - ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2; - ind_sizes1 += szSep; - lvl ++; - szSep = szSep / 2; - lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - CS->snd_interSz[lvl] += i + j + 4; - CS->snd_LinterSz[lvl] += i + 2; - if (CS->snd_vtxinter[lvl] == EMPTY) - CS->snd_vtxinter[lvl] = snrep; - } - } - } - } - - /* restore value in cntelt_vtcs */ - if (lstVtx_dns_lid != EMPTY) - Llu_symbfact->cntelt_vtcs[lstVtx_dns_lid] = save_cnt; - *p_mark = mark; - if (minElt_vtx != CS->rcv_buf) - SUPERLU_FREE (minElt_vtx); - SUPERLU_FREE (x_newelts_U); - if (newelts_L) SUPERLU_FREE (newelts_L); - if (newelts_U) SUPERLU_FREE (newelts_U); - if (PS->szDnsSep < mem_dnsCS) - PS->szDnsSep = mem_dnsCS; -} - -/* all processors affected to current node must call this routine - when VInfo->filledSep == FILLED_SEP - This is necessary since subsequent routines called from here use - MPI_allreduce among all processors affected to curent node */ - -static int_t -denseSep_symbfact -( - int rcvd_dnsSep, /* =1 if processor received info that the separator - became dense, - =0 if myPE determined that separator is full */ - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each separator in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int szSep, - int fstP, /* first pe affected current node */ - int lstP, /* last pe affected current node */ - int_t fstVtx_blkCyc, - int_t nblk_loc, /* block number in the block cyclic distribution of current - supernode */ - int_t *p_nextl, - int_t *p_nextu, - int_t *p_mark, - int_t *p_nsuper_loc, - int_t *marker, - MPI_Comm ndCom, - MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS -) -{ - int nprocsLvl, p, prvP, tag; - int_t nmsgsToSnd, nmsgsToRcv; - int_t ind_blk, mem_error; - int_t *rcv_intraLvl; - int_t fstVtx, lstVtx, cur_blk, lstVtx_blk, fstVtx_blk; - int_t *globToLoc, maxNvtcsPProc; - MPI_Status status; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - rcv_intraLvl = CS->rcv_intraLvl; - cur_blk = VInfo->curblk_loc; - nprocsLvl = lstP - fstP; - - if (nblk_loc == 0) { - nmsgsToSnd = 2; nmsgsToRcv = 1; - } - else { - nmsgsToSnd = 1; nmsgsToRcv = 0; - if (!rcvd_dnsSep) nmsgsToRcv ++; - } - if (iam == fstP && rcvd_dnsSep && nblk_loc == 1) - nmsgsToRcv ++; - - /* first exchange msgs with all processors affected to current node */ - ind_blk = cur_blk; - while ((nmsgsToSnd || nmsgsToRcv) && VInfo->begEndBlks_loc[ind_blk] < lstVtx) { - tag = (int) (tag_intraLvl + nblk_loc); - if (nmsgsToSnd) { - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - if (lstVtx_blk != lstVtx) { - p = OWNER( globToLoc[lstVtx_blk]); - MPI_Send (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p, - tag, (*symb_comm)); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd += (float) 1; -#endif - } - nmsgsToSnd --; - } - ind_blk += 2; - nblk_loc ++; - tag = tag_intraLvl + nblk_loc; - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - if (nmsgsToRcv && fstVtx_blk < lstVtx) { - if (iam == fstP) tag --; - prvP = OWNER( globToLoc[fstVtx_blk - 1]); - MPI_Recv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP, - tag, (*symb_comm), &status); -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd += (float) 1; -#endif - nmsgsToRcv --; - } - } - - if (VInfo->filledSep == FILLED_SEP) { - if (mem_error = - dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, lstP - fstP, rcvd_dnsSep, p_nextl, - p_nextu, p_mark, p_nsuper_loc, marker, ndCom, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) - return (mem_error); - } - else if (rcvd_dnsSep) - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, EMPTY, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - return 0; -} - - -static int_t -interLvl_symbfact -( - SuperMatrix *A, /* Input - input matrix A */ - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator (node) */ - int fstP, /* Input - first processor assigned to current node */ - int lstP, /* Input - last processor assigned to current node */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t *p_nextl, - int_t *p_nextu, - int_t *p_nsuper_loc, - int_t *pmark, /* mark for symbfact */ - int_t *marker, /* temp array used for marking */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - comm_symbfact_t *CS,/* infos on communication data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - MPI_Comm ndComm, - MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */ - ) -{ - MPI_Status *status; - MPI_Request *request_snd, *request_rcv; - - int nprocsLvl, rcvdP, p, filledSep_lvl; - int toSend, toSendL, toSendU; - int_t *rcv_interLvl; - int_t *snd_interLvl, *snd_interLvl1, *snd_interLvl2, - snd_interLvlSz, snd_LinterLvlSz, snd_vtxLvl; - int_t vtx_elt, update_loc, code_err; - int_t *lsub, *xlsub, *usub, *xusub; - int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz; - int_t n, mark, max_rcvSz; - int_t nextl, nextu, ind_blk, vtx_lid, k, count, nelts, - lstVtxLvl_loc, lstVtxLvl_loc_lid, mem_error; - int_t fstVtx_blk, lstVtx_blk, i, j, vtx, prElt_L, prElt_U, - snd_indBlk, prElt_ind; - int_t fstVtxLvl_loc, nvtcsLvl_loc, maxNvtcsPProc, *globToLoc, - fstVtx, lstVtx; - int ind1, nprocsToRcv, nprocsToSnd, ind2, ind_l, ind_u, ij, ik; - int_t req_ind, sent_msgs, req_ind_snd; - int_t initInfo_loc[2], initInfo_gl[2]; - - /* Initialization */ - n = A->ncol; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nprocsLvl = lstP - fstP; - rcv_interLvl = CS->rcv_interLvl; - snd_interLvl = CS->snd_interLvl; - snd_interLvlSz = CS->snd_interSz[lvl]; - snd_LinterLvlSz = CS->snd_LinterSz[lvl]; - snd_vtxLvl = CS->snd_vtxinter[lvl]; - fstVtxLvl_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - nvtcsLvl_loc = VInfo->nvtcsLvl_loc; - request_snd = NULL; - request_rcv = NULL; - status = NULL; - mark = *pmark; - - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* snd_vtxLvl denotes the first vertex from which iam needs - to send data. - snd_interLvlSz denotes maximum size of the send data, - snd_LinterLvlSz denotes send data corresponding to L part */ - - /* determine maximum size of receive buffer and information - on filled sep */ - if (snd_interLvlSz != 0) { - if (snd_LinterLvlSz == 0) - snd_interLvlSz = 0; - if (snd_interLvlSz - snd_LinterLvlSz == 0) - snd_interLvlSz = 0; - } - - initInfo_loc[0] = snd_interLvlSz; - initInfo_loc[1] = (int_t) VInfo->filledSep; - MPI_Allreduce (initInfo_loc, initInfo_gl, 2, - mpi_int_t, MPI_MAX, ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl )); - PS->sz_msgsCol += 2; - if (PS->maxsz_msgCol < 2) - PS->maxsz_msgCol = 2; -#endif - max_rcvSz = initInfo_gl[0]; - filledSep_lvl = (int) initInfo_gl[1]; - - if (filledSep_lvl == FILLED_SEPS) { - /* quick return if all upper separators are dense */ - if (VInfo->filledSep != FILLED_SEPS) { - VInfo->filledSep = FILLED_SEPS; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, - fstVtxSep, - EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - } - return 0; - } - - if (max_rcvSz == 0) - /* quick return if no communication necessary */ - return 0; - - /* allocate data for the send buffer */ - if (snd_interLvlSz) - if (CS->snd_bufSz < snd_interLvlSz) { - PS->maxSzBuf += snd_interLvlSz - CS->snd_bufSz; - if (CS->snd_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->snd_buf); - CS->snd_bufSz = snd_interLvlSz; - if (!(CS->snd_buf = intMalloc_symbfact (snd_interLvlSz))) { - ABORT("Malloc fails for snd_buf[]."); - } - } - - /* snd_interLvl : to which processors the data need to be send - * information setup during the copy of data to be send in the buffer - * rcv_interLvl : from which processors iam receives update data */ - for (p = 2*fstP; p < 2*lstP; p++) - snd_interLvl[p] = EMPTY; - - if (snd_interLvlSz == 0 && nvtcsLvl_loc == 0) { - code_err = MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t, - &(rcv_interLvl[2*fstP]), 2, mpi_int_t, - ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl )); - PS->sz_msgsCol += 2; - if (PS->maxsz_msgCol < 2) - PS->maxsz_msgCol = 2; -#endif - return 0; - } - - /* in interLvlInfos, - * obtain from which processors iam receives update information */ - update_loc = FALSE; - nextl = 0; - nextu = snd_LinterLvlSz; - if (snd_interLvlSz != 0) { - /* copy data to be send */ - /* find index block from where to send data */ - ind_blk = VInfo->curblk_loc; - while (snd_vtxLvl < VInfo->begEndBlks_loc[ind_blk]) { - ind_blk -= 2; - } - snd_indBlk = ind_blk; - vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] ); - for (; ind_blk < VInfo->curblk_loc; ind_blk += 2) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - if (ind_blk == snd_indBlk) - fstVtx_blk = snd_vtxLvl; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) { - toSendL = FALSE; toSendU = FALSE; - if (xlsub[vtx_lid] != xlsub[vtx_lid+1] && - xusub[vtx_lid] != xusub[vtx_lid+1]) { - k = xlsub[vtx_lid]; - prElt_L = lsub[k]; - j = xusub[vtx_lid]; - prElt_U = usub[j]; - if (prElt_L >= fstVtx || prElt_U >= fstVtx) { - if (prElt_L >= fstVtx) - while (lsub[k] <= prElt_L && k < xlsub[vtx_lid + 1]) { - vtx_elt = lsub[k]; - if (vtx_elt >= fstVtx && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_interLvl[2*p] = TRUE; - toSendL = TRUE; - } - else - update_loc = TRUE; - } - k++; - } - if (prElt_U >= fstVtx) - while (usub[j] <= prElt_U && j < xusub[vtx_lid + 1]) { - vtx_elt = usub[j]; - if (vtx_elt >= fstVtx && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_interLvl[2*p+1] = TRUE; - toSendU = TRUE; - } - else - update_loc = TRUE; - } - j ++; - } - if (toSendL || toSendU) { - /* L(:, vtx) and U(vtx, :) will be send to processors */ - CS->snd_buf[nextu + DIAG_IND] = vtx; - nelts = xusub[vtx_lid+1] - xusub[vtx_lid]; - CS->snd_buf[nextu + NELTS_IND] = nelts; - nextu += 2; - for (j = xusub[vtx_lid]; j < xusub[vtx_lid+1]; j++, nextu ++) { - CS->snd_buf[nextu] = usub[j]; - } - CS->snd_buf[nextl + DIAG_IND] = vtx; - nelts = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - CS->snd_buf[nextl + NELTS_IND] = nelts; - nextl += 2; - for (j = xlsub[vtx_lid]; j < xlsub[vtx_lid+1]; j++, nextl ++) { - CS->snd_buf[nextl] = lsub[j]; - } - } - } - } - } - } - lstVtxLvl_loc = vtx; - lstVtxLvl_loc_lid = vtx_lid; - } - - if (nextl == 0 || nextu - snd_LinterLvlSz == 0) { - for (p = 2*fstP; p < 2*lstP; p++) - snd_interLvl[p] = EMPTY; - } - - nprocsToSnd = 0; - for (p = 2*fstP; p < 2*lstP; p +=2) { - if (snd_interLvl[p] != EMPTY || snd_interLvl[p+1] != EMPTY) { - snd_interLvl[p] = nextl; - snd_interLvl[p+1] = nextu - snd_LinterLvlSz; - nprocsToSnd ++; - } - } - - MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t, - &(rcv_interLvl[2*fstP]), 2, mpi_int_t, ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl )); - PS->sz_msgsCol += 2 * nprocsLvl; - if (PS->maxsz_msgCol < 2 * nprocsLvl) - PS->maxsz_msgCol = 2 * nprocsLvl; -#endif - - max_rcvSz = 0; - nprocsToRcv = 0; - for (p = 2*fstP; p < 2*lstP; p +=2) { - CS->ptr_rcvBuf[p] = max_rcvSz; - if (rcv_interLvl[p] != EMPTY) - max_rcvSz += rcv_interLvl[p]; - CS->ptr_rcvBuf[p+1] = max_rcvSz; - if (rcv_interLvl[p+1] != EMPTY) - max_rcvSz += rcv_interLvl[p+1]; - if (rcv_interLvl[p] != EMPTY || rcv_interLvl[p+1] != EMPTY) - nprocsToRcv ++; - } - - /* allocate data for the receive buffer */ - if (CS->rcv_bufSz < max_rcvSz) { - PS->maxSzBuf += max_rcvSz - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = max_rcvSz; - if (!(CS->rcv_buf = intMalloc_symbfact (max_rcvSz))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - /* allocate memory for status arrays */ - if (nprocsToSnd) - if ( !(request_snd = (MPI_Request*) - SUPERLU_MALLOC(2 * nprocsToSnd * sizeof(MPI_Request)))) - ABORT("Not enough memory when allocating MPI_Request"); - if (nprocsToRcv) - if ( !(request_rcv = (MPI_Request*) - SUPERLU_MALLOC(2 * nprocsToRcv * sizeof(MPI_Request)))) - ABORT("Not enough memory when allocating MPI_Request"); - if (nprocsToRcv || nprocsToSnd) - if ( !(status = (MPI_Status*) - SUPERLU_MALLOC(2 * (lstP-fstP) * sizeof(MPI_Status)))) - ABORT("Not enough memory when allocating MPI_Request"); - - /* determine if we have to send data */ - i = 0; - for (toSend = fstP, p = 2*fstP; p < 2*lstP; toSend++, p+=2) - if (snd_interLvl[p] != EMPTY && toSend != iam) { - MPI_Isend (CS->snd_buf, nextl, mpi_int_t, toSend, - tag_interLvl_LData, (*symb_comm), &(request_snd[2*i])); - MPI_Isend (&(CS->snd_buf[snd_LinterLvlSz]), - nextu - snd_LinterLvlSz, mpi_int_t, toSend, - tag_interLvl_UData, (*symb_comm), &(request_snd[2*i+1])); - i++; -#if ( PRNTlevel>=1 ) - PS->no_msgsSnd += (float) 2; - PS->sz_msgsSnd += (float) (nextl + nextu - snd_LinterLvlSz); - if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl; - if (PS->maxsz_msgSnd < nextu - snd_LinterLvlSz) - PS->maxsz_msgSnd = nextu - snd_LinterLvlSz; -#endif - } - - if (update_loc) { - /* use own data to update symbolic factorization */ - vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] ); - lsub_rcvd = &(lsub[xlsub[vtx_lid]]); - lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid]; - usub_rcvd = &(usub[xusub[vtx_lid]]); - usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid]; - - mem_error = - rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - - lsub_rcvd = &(Llu_symbfact->lsub[xlsub[vtx_lid]]); - lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid]; - usub_rcvd = &(Llu_symbfact->usub[xusub[vtx_lid]]); - usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid]; - lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub; - mem_error = - rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub; - } - - /* post non-blocking receives for all the incoming messages */ - i = 0; - for (rcvdP = fstP, p = 2*fstP; p < 2*lstP; rcvdP++, p += 2) - if (rcv_interLvl[p] != EMPTY) { - lsub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[p]]); - MPI_Irecv (lsub_rcvd, rcv_interLvl[p], mpi_int_t, rcvdP, - tag_interLvl_LData, (*symb_comm), &(request_rcv[i])); - usub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[p+1]]); - MPI_Irecv (usub_rcvd, rcv_interLvl[p+1], mpi_int_t, rcvdP, - tag_interLvl_UData, (*symb_comm), &(request_rcv[i+1])); - i += 2; -#if ( PRNTlevel>=1 ) - PS->no_msgsRcvd += (float) 2; - PS->sz_msgsRcvd += (float) (rcv_interLvl[p] + rcv_interLvl[p+1]); - if (PS->maxsz_msgRcvd < rcv_interLvl[p]) - PS->maxsz_msgRcvd = rcv_interLvl[p]; - if (PS->maxsz_msgRcvd < rcv_interLvl[p+1]) - PS->maxsz_msgRcvd = rcv_interLvl[p+1]; -#endif - } - - /* wait until messages are received and update local data */ - for (i = 0; i < nprocsToRcv; i++) { - MPI_Waitany (2*nprocsToRcv, request_rcv, &ind1, status); - ij = 0; - for (p = fstP; p < lstP; p++) - if (rcv_interLvl[2*p] != EMPTY) { - if (ij <= ind1 && ind1 < ij+2) { - rcvdP = p; p = lstP; - if (ind1 == ij) ind2 = ij+1; - else ind2 = ind1 - 1; - ind_l = ij; ind_u = ij+1; - } - ij += 2; - } - MPI_Get_count (status, mpi_int_t, &ij); - MPI_Wait (&(request_rcv[ind2]), status); - MPI_Get_count (status, mpi_int_t, &ik); - if (ind1 == ind_l) { - lsub_rcvd_sz = ij; - usub_rcvd_sz = ik; - } else { - lsub_rcvd_sz = ik; - usub_rcvd_sz = ij; - } - lsub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP]]); - usub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP+1]]); - - /* use received data to update symbolic factorization information */ - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - if (nprocsToSnd) - MPI_Waitall (2*nprocsToSnd, request_snd, status); - - *pmark = mark; - if (request_snd != NULL) SUPERLU_FREE (request_snd); - if (request_rcv != NULL) SUPERLU_FREE (request_rcv); - if (status != NULL) SUPERLU_FREE (status); -} - -static void -freeComm -( - int iam, /* Input -my processor number */ - int nprocs, /* Input -number of procs for the symbolic fact. */ - MPI_Comm *commLvls, /* Input -communicators for the nodes in the sep tree */ - MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */ - ) -{ - int szSep, i, j, k; - int np, npNode, fstP, lstP, ind; - - i = 2 * nprocs - 2; - MPI_Comm_free (&(commLvls[i])); - - szSep = 2; - i -= szSep; - - while (i > 0) { - /* for each level in the separator tree */ - npNode = nprocs / szSep; - fstP = 0; - /* for each node in the level */ - for (j = i; j < i + szSep; j++) { - lstP = fstP + npNode; - if (fstP <= iam && iam < lstP) { - ind = j; - } - fstP += npNode; - } - MPI_Comm_free ( &(commLvls[ind]) ); - szSep *= 2; - i -= szSep; - } -} - -static void -createComm -( - int iam, /* Input -my processor number */ - int nprocs, /* Input -number of procs for the symbolic factorization */ - MPI_Comm *commLvls, /* Output -communicators for the nodes in the sep tree */ - MPI_Comm *symb_comm - ) -{ - int szSep, i, j, jj, k, *pranks; - int np, npNode, fstP, lstP, p, code_err, ind, col, key; - - for (i=0; i < 2*nprocs; i++) - commLvls[i] = MPI_COMM_NULL; - - /* Make a list of the processes in the new communicator. */ - pranks = (int *) SUPERLU_MALLOC( nprocs * sizeof(int) ); - - i = 2 * nprocs - 2; - MPI_Comm_dup ((*symb_comm), &(commLvls[i])); - szSep = 2; - i -= szSep; - - while (i > 0) { - /* for each level in the separator tree */ - npNode = nprocs / szSep; - fstP = 0; - /* for each node in the level */ - for (j = i; j < i + szSep; j++) { - lstP = fstP + npNode; - if (fstP <= iam && iam < lstP) { - ind = j; - key = iam - fstP; - col = fstP; - } - fstP += npNode; - } - MPI_Comm_split ((*symb_comm), col, key, &(commLvls[ind]) ); - - szSep *= 2; - i -= szSep; - } - - SUPERLU_FREE (pranks); -} - -static void -intraLvl_symbfact -( - SuperMatrix *A, /* Input - original matrix A */ - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator(node) */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int fstP, /* Input - first processor assigned to current node */ - int lstP, /* Input - last processor assigned to current node */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc, - MPI_Comm ndComm, - MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */ - ) -{ - int nprocsLvl, p, prvP, rcvP; - int toSend, rcvd_prvP, index_req[2]; - int_t fstVtx_loc_lid, fstVtx_loc, vtx, vtxLvl, curblk_loc, denseSep; - int_t fstVtx_blk, fstVtx_blk_lid, lstVtx_blk, lstVtx_blk_lid, tag; - int_t nvtcs_blk, xusub_end, xlsub_end, prv_fstVtx_blk; - int_t n; - int_t *rcv_intraLvl, *snd_intraLvl; - int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz; - int_t nmsgsRcvd, nmsgsTRcv, sz_msg; - int_t nvtcsLvl_loc, nextl, nextu, ind_blk, snd_vtxLvl, maxNeltsVtx_in; - int_t count, vtx_loc, mem_error, lstBlkRcvd; - int_t fstVtx_blk_loc, fstBlk, vtx_lid, prElt, nelts, j, nvtcs_toUpd; - int_t snd_LinterLvlSz, fstVtx_blk_loc_lid, prElt_ind, maxNmsgsToRcv; - int_t *xlsub, *xusub, *lsub, *usub; - int_t *globToLoc, maxNvtcsPProc, nblk_loc, upd_myD, r, fstVtx_blkCyc; - int_t k, prElt_L, prElt_U, vtx_elt, fstVtx_toUpd; - int intSzMsg; - - MPI_Status status[4]; - MPI_Request request[4]; - - /* Initializations */ - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* max number of msgs this processor can receive during - intraLvl_symbfact routine */ - maxNmsgsToRcv = (lstVtx - fstVtx) / VInfo->maxSzBlk + 1; - maxNeltsVtx_in = VInfo->maxNeltsVtx; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - n = A->ncol; - nprocsLvl = lstP - fstP; - rcv_intraLvl = CS->rcv_intraLvl; - snd_intraLvl = CS->snd_intraLvl; - nvtcsLvl_loc = VInfo->nvtcsLvl_loc; - nmsgsTRcv = 0; - nmsgsRcvd = 0; - nblk_loc = 0; - nvtcs_toUpd = nvtcsLvl_loc; - fstVtx_blk = fstVtx; - denseSep = FALSE; - - /* determine first vertex that belongs to fstP */ - k = fstVtx; - fstVtx_blkCyc = n; - while (k < lstVtx && fstVtx_blkCyc == n) { - p = OWNER( globToLoc[k] ); - if (p == fstP) - fstVtx_blkCyc = k; - k += VInfo->maxSzBlk; - } - - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = 0; - - for (r = 0; r < 3; r++) - request[r] = MPI_REQUEST_NULL; - - fstVtx_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - vtx = fstVtx_loc; - if (fstVtx_loc >= fstVtx_blkCyc) - nblk_loc = 1; - while (VInfo->begEndBlks_loc[VInfo->curblk_loc] < lstVtx && !VInfo->filledSep) { - CS->snd_intraSz = 0; - CS->snd_LintraSz = 0; - - lstBlkRcvd = FALSE; - prv_fstVtx_blk = fstVtx_blk; - fstVtx_blk = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - lstVtx_blk = VInfo->begEndBlks_loc[VInfo->curblk_loc + 1]; - fstVtx_toUpd = VInfo->begEndBlks_loc[VInfo->curblk_loc + 2]; - fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - lstVtx_blk_lid = LOCAL_IND( globToLoc[lstVtx_blk - 1] + 1); - nvtcs_blk = lstVtx_blk - fstVtx_blk; - nvtcs_toUpd -= nvtcs_blk; - nmsgsTRcv = n; - VInfo->maxNeltsVtx -= fstVtx_blk - prv_fstVtx_blk; - - index_req[0] = EMPTY; - for (r = 0; r < 3; r++) - request[r] = MPI_REQUEST_NULL; - if (fstVtx_blk != fstVtx) { - /* if not the first vertex of the level */ - prvP = OWNER( globToLoc[fstVtx_blk - 1] ); - rcvd_prvP = FALSE; - /* receive info on number messages to receive */ - tag = tag_intraLvl + nblk_loc; - if (iam == fstP) tag --; - - MPI_Irecv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP, - tag, (*symb_comm), &(request[1])); - - while (!rcvd_prvP || nmsgsRcvd < nmsgsTRcv) { - if (index_req[0] != 1) { - MPI_Irecv (&sz_msg, 1, mpi_int_t, - MPI_ANY_SOURCE, tag_intraLvl_szMsg, - (*symb_comm), &(request[0])); - if (sz_msg > INT_MAX) - ABORT("ERROR in intraLvl_symbfact size to send > INT_MAX\n"); - } - MPI_Waitany (2, request, index_req, status); - if (index_req[0] == 1) { - /* receive information on no msgs to receive */ -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd ++; -#endif - rcvd_prvP = TRUE; - nmsgsTRcv = rcv_intraLvl[iam]; - /* if dense separator was detected by one of the - previous processors ... */ - if (nmsgsTRcv > maxNmsgsToRcv) { - VInfo->filledSep = (int) nmsgsTRcv / maxNmsgsToRcv; - nmsgsTRcv = nmsgsTRcv % maxNmsgsToRcv; - } - - if (nmsgsTRcv == nmsgsRcvd) { - /* MPI_Cancel (&(request[0])); */ - MPI_Send (&r, 1, mpi_int_t, iam, - tag_intraLvl_szMsg, (*symb_comm)); - MPI_Wait (&(request[0]), status); - } - } - if (index_req[0] == 0) { - nmsgsRcvd ++; - if (nmsgsTRcv == nmsgsRcvd) lstBlkRcvd = TRUE; - rcvP = status->MPI_SOURCE; - - /* allocate enough space to receive data */ - if (CS->rcv_bufSz < sz_msg) { - PS->maxSzBuf += sz_msg - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = sz_msg; - if (!(CS->rcv_buf = intMalloc_symbfact (sz_msg))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - /* use received data to update symbolic factorization */ - lsub_rcvd = CS->rcv_buf; - MPI_Recv (lsub_rcvd, sz_msg, mpi_int_t, - rcvP, tag_intraLvl_LData, (*symb_comm), status); - MPI_Get_count (status, mpi_int_t, &intSzMsg); - lsub_rcvd_sz = intSzMsg; - usub_rcvd = &(CS->rcv_buf[lsub_rcvd_sz]); - MPI_Recv (usub_rcvd, sz_msg - lsub_rcvd_sz, - mpi_int_t, rcvP, - tag_intraLvl_UData, (*symb_comm), status); - MPI_Get_count (status, mpi_int_t, &intSzMsg); - usub_rcvd_sz = intSzMsg; -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd ++; - PS->no_msgsRcvd += (float) 2; - PS->sz_msgsRcvd += (float) sz_msg; - if (PS->maxsz_msgRcvd < lsub_rcvd_sz) PS->maxsz_msgRcvd = lsub_rcvd_sz; - if (PS->maxsz_msgRcvd < usub_rcvd_sz) PS->maxsz_msgRcvd = usub_rcvd_sz; -#endif - - if (!lstBlkRcvd) { - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - } - } - } - - if (VInfo->filledSep) { - mem_error = - denseSep_symbfact (1, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, - ndComm, symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - else { - /* compute symbolic factorization for this block */ - if (!lstBlkRcvd) { - lsub_rcvd = NULL; usub_rcvd = NULL; - } - - blk_symbfact (A, iam, lvl, - szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - fstVtx_loc, fstVtx_blk, lstVtx_blk, - lsub_rcvd, lsub_rcvd_sz, usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, CS, PS, - marker, p_mark, - p_nextl, p_nextu, p_neltsZr, p_neltsTotal, - p_nsuper_loc); - lsub = Llu_symbfact->lsub; - usub = Llu_symbfact->usub; - - if (lstVtx_blk != lstVtx) { - /* if this is not the last block of the level */ - if (VInfo->filledSep == FILLED_SEPS || - ( VInfo->filledSep == FILLED_SEP && - ((lstVtx - lstVtx_blk > VInfo->maxSzBlk * nprocsLvl && nblk_loc > 0) || - (lstVtx - fstVtx_blkCyc > VInfo->maxSzBlk * nprocsLvl && nblk_loc == 0)))) - /* if current separator is dense and this is not the last block, - then ... */ - denseSep = TRUE; - else - /* separator dense but not enough uncomputed blocks - in the separator to take advantage of it */ - VInfo->filledSep = FALSE; - - if (VInfo->filledSep == FILLED_SEPS) { - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + rcv_intraLvl[p]; - denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, - symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - else { - /* send blk to next procs and update the rest of my own blocks */ - if (lstBlkRcvd) { - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - upd_myD = FALSE; - /* determine processors to which send this block - and copy data to be sent */ - for (p = fstP; p < lstP; p++) - snd_intraLvl[p] = FALSE; - nextl = 0; - nextu = nextl + CS->snd_LintraSz; - - /* allocate enough space to receive data */ - if (CS->rcv_bufSz < CS->snd_intraSz) { - PS->maxSzBuf += CS->snd_intraSz - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = CS->snd_intraSz; - if (!(CS->rcv_buf = intMalloc_symbfact (CS->snd_intraSz))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - for (vtx = fstVtx_blk, vtx_lid = fstVtx_blk_lid; - vtx < lstVtx_blk; vtx++, vtx_lid ++) { - toSend = FALSE; - k = xlsub[vtx_lid]; - prElt_L = lsub[k]; - j = xusub[vtx_lid]; - prElt_U = usub[j]; - - if (prElt_L >= lstVtx_blk || prElt_U >= lstVtx_blk) { - if (vtx == lstVtx_blk - 1) { - xlsub_end = *p_nextl; - xusub_end = *p_nextu; - } - else { - xlsub_end = xlsub[vtx_lid + 1]; - xusub_end = xusub[vtx_lid + 1]; - } - if (prElt_L >= lstVtx_blk) { - while (lsub[k] <= prElt_L && k < xlsub_end) { - vtx_elt = lsub[k]; - if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_intraLvl[p] = TRUE; - toSend = TRUE; - } - else { - upd_myD = TRUE; - } - } - k++; - } - } - if (prElt_U >= lstVtx_blk) { - while (usub[j] <= prElt_U && j < xusub_end) { - vtx_elt = usub[j]; - if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_intraLvl[p] = TRUE; - toSend = TRUE; - } - else { - upd_myD = TRUE; - } - } - j ++; - } - } - if (toSend) { - /* L(:, vtx) and U(vtx, :) will be send to processors */ - nelts = xusub_end - xusub[vtx_lid]; - CS->rcv_buf[nextu + DIAG_IND] = vtx; - CS->rcv_buf[nextu + NELTS_IND] = nelts; - nextu += 2; - for (j = xusub[vtx_lid]; j < xusub_end; j++) { - CS->rcv_buf[nextu] = usub[j]; nextu ++; - } - - nelts = xlsub_end - xlsub[vtx_lid]; - CS->rcv_buf[nextl + DIAG_IND] = vtx; - CS->rcv_buf[nextl + NELTS_IND] = nelts; - nextl += 2; - for (j = xlsub[vtx_lid]; j < xlsub_end; j++) { - CS->rcv_buf[nextl] = lsub[j]; nextl ++; - } - } - } - } - for (p = fstP; p < lstP; p++) - if (snd_intraLvl[p]) - rcv_intraLvl[p] ++; - - if (VInfo->filledSep == FILLED_SEP) { - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + - rcv_intraLvl[p]; - } - else { - /* send to the owner of the next block info on no of messages */ - p = OWNER( globToLoc[lstVtx_blk] ); - tag = tag_intraLvl + nblk_loc; - - MPI_Isend (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p, - tag, (*symb_comm), request); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd ++; -#endif - } - - /* there is data to be send */ - sz_msg = nextl + nextu - CS->snd_LintraSz; - for (p = fstP; p < lstP; p++) { - if (p != iam && snd_intraLvl[p]) { - MPI_Isend (&sz_msg, 1, mpi_int_t, p, - tag_intraLvl_szMsg, (*symb_comm), &(request[1])); - MPI_Isend (CS->rcv_buf, nextl, mpi_int_t, p, - tag_intraLvl_LData, (*symb_comm), &(request[2])); - MPI_Isend (&(CS->rcv_buf[CS->snd_LintraSz]), - nextu - CS->snd_LintraSz, mpi_int_t, p, - tag_intraLvl_UData, (*symb_comm), &(request[3])); - MPI_Waitall(3, &(request[1]), &(status[1])); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd ++; - PS->no_msgsSnd += (float) 2; - PS->sz_msgsSnd += (float) sz_msg; - if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl; - if (PS->maxsz_msgSnd < nextu - CS->snd_LintraSz) - PS->maxsz_msgSnd = nextu - CS->snd_LintraSz; -#endif - } - } - if (VInfo->filledSep != FILLED_SEP) { - MPI_Wait (request, status); - } - - /* update rest of vertices */ - if (upd_myD) { - lsub_rcvd_sz = (*p_nextl) - xlsub[fstVtx_blk_lid]; - lsub_rcvd = &(lsub[xlsub[fstVtx_blk_lid]]); - usub_rcvd_sz = (*p_nextu) - xusub[fstVtx_blk_lid]; - usub_rcvd = &(usub[xusub[fstVtx_blk_lid]]); - - mem_error = - rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, fstVtx_blk, lstVtx_blk, - EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - lsub_rcvd = &(lsub[xlsub[fstVtx_blk_lid]]); - mem_error = - rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, fstVtx_blk, lstVtx_blk, - EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - if (VInfo->filledSep == FILLED_SEP) - denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, - symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - } - } - VInfo->curblk_loc += 2; - nblk_loc ++; - } - - /* update maxNeltsVtx */ - VInfo->maxNeltsVtx = maxNeltsVtx_in - lstVtx + fstVtx; - - /* if current separator dense, then reset value of filledSep */ - if (VInfo->filledSep == FILLED_SEP) - VInfo->filledSep = FALSE; -} - -static void -symbfact_free -( - int iam, /* Input - my processor number */ - int nprocs, /* Input - number of processors for the symbolic factorization */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS - ) -{ - /* free memory corresponding to prune structure */ - if (Llu_symbfact->szLsubPr != 0) - SUPERLU_FREE( Llu_symbfact->lsubPr ); - if (Llu_symbfact->szUsubPr != 0) - SUPERLU_FREE( Llu_symbfact->usubPr ); - if (Llu_symbfact->xlsubPr != NULL) - SUPERLU_FREE( Llu_symbfact->xlsubPr ); - if (Llu_symbfact->xusubPr != NULL) - SUPERLU_FREE( Llu_symbfact->xusubPr ); - - if (Llu_symbfact->xlsub_rcvd != NULL) - SUPERLU_FREE( Llu_symbfact->xlsub_rcvd); - if (Llu_symbfact->xusub_rcvd != NULL) - SUPERLU_FREE( Llu_symbfact->xusub_rcvd); - - if (Llu_symbfact->cntelt_vtcs != NULL) - SUPERLU_FREE( Llu_symbfact->cntelt_vtcs); - if (Llu_symbfact->cntelt_vtcsA_lvl != NULL) - SUPERLU_FREE( Llu_symbfact->cntelt_vtcsA_lvl); - - if (CS->rcv_bufSz != 0) - SUPERLU_FREE( CS->rcv_buf ); - if (CS->snd_bufSz != 0) - SUPERLU_FREE( CS->snd_buf ); - - SUPERLU_FREE( VInfo->begEndBlks_loc); - SUPERLU_FREE( CS->rcv_interLvl); - SUPERLU_FREE( CS->snd_interLvl); - SUPERLU_FREE( CS->ptr_rcvBuf); - SUPERLU_FREE( CS->rcv_intraLvl); - SUPERLU_FREE( CS->snd_intraLvl); - SUPERLU_FREE( CS->snd_interSz); - SUPERLU_FREE( CS->snd_LinterSz); - SUPERLU_FREE( CS->snd_vtxinter); -} - -static void -estimate_memUsage -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - mem_usage_t *symb_mem_usage, - float *p_totalMemLU, /* Output -memory used for symbolic factorization */ - float *p_overestimMem, /* Output -memory allocated during to right looking - overestimation memory usage */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS - ) -{ - int_t nvtcs_loc, lword, nsuper_loc; - float lu_mem, other_mem, overestimMem; - - nvtcs_loc = VInfo->nvtcs_loc; - nsuper_loc = Pslu_freeable->supno_loc[nvtcs_loc]; - lword = sizeof(int_t); - - /* memory for xlsub, xusub, supno_loc, cntelt_vtcs */ - lu_mem = 4.0 * (float) nvtcs_loc * (float) lword; - /* memory for xlsubPr, xusubPr */ - lu_mem += 2.0 * (float) VInfo->maxNvtcsNds_loc * (float) lword; - - if (PS->estimLSz < Llu_symbfact->xlsub[nvtcs_loc]) - PS->estimLSz = Llu_symbfact->xlsub[nvtcs_loc]; - if (PS->estimUSz < Llu_symbfact->xusub[nvtcs_loc]) - PS->estimUSz = Llu_symbfact->xusub[nvtcs_loc]; - - lu_mem += (float) PS->estimLSz * lword; - lu_mem += (float) PS->estimUSz * lword; - lu_mem += (float) PS->maxSzLPr * lword; - lu_mem += (float) PS->maxSzUPr * lword; - lu_mem += (float) PS->szDnsSep * lword; - /* memory for globToLoc, tempArray */ - lu_mem += (float) 2* (float) n * lword; - lu_mem += (float) PS->maxSzBuf * lword; - - overestimMem = (float) (PS->estimLSz - Llu_symbfact->xlsub[nvtcs_loc]) * lword; - overestimMem += (float) (PS->estimUSz - Llu_symbfact->xusub[nvtcs_loc]) * lword; - - *p_totalMemLU = lu_mem; - *p_overestimMem = overestimMem; - - symb_mem_usage->for_lu = (float) ((3 * nvtcs_loc + 2 * nsuper_loc) * lword); - symb_mem_usage->for_lu += (float) (Llu_symbfact->xlsub[nvtcs_loc] * lword); - symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword); - symb_mem_usage->total = lu_mem; -} - - -static int_t * -intMalloc_symbfact(int_t n) -{ - int_t *buf; - if (n == 0) - buf = NULL; - else - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - return buf; -} - -static int_t * -intCalloc_symbfact(int_t n) -{ - int_t *buf; - register int_t i; - - if (n == 0) - buf = NULL; - else - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - if ( buf ) - for (i = 0; i < n; i++) buf[i] = 0; - return (buf); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,5125 +0,0 @@ - -/* - * -- Parallel symbolic factorization routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 - * INRIA France - January 2004 - * Laura Grigori - * - * November 1, 2007 - */ - - -/* - * The function symbfact_dist implements the parallel symbolic factorization - * algorithm described in the paper: - * - * Parallel Symbolic Factorization for Sparse LU with Static Pivoting, - * Laura Grigori, James W. Demmel and Xiaoye S. Li, - * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3. - * - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include -#include -#include "superlu_ddefs.h" -#include "psymbfact.h" - -/* - * Internal protypes - */ - -static int_t * -intMalloc_symbfact(int_t ); - -static int_t * -intCalloc_symbfact(int_t ); - -static int_t -initParmsAndStats -(psymbfact_stat_t *PS); - -static void -estimate_memUsage -(int_t, int, mem_usage_t *, float *, float *, - Pslu_freeable_t *, Llu_symbfact_t *, - vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *); - -static void -symbfact_free -(int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *); - -static int_t -denseSep_symbfact -(int , int_t, int, int, int, int_t *, int_t *, int, - int, int, int_t, int_t, int_t *, int_t *, int_t *, - int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *, - Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t * ); - -static int_t -dnsUpSeps_symbfact -(int_t, int, int, int, int, int_t *, int_t *, int_t, - Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *); - -static void -intraLvl_symbfact -(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, - int, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *); - -static void -initLvl_symbfact -(int_t, int, int_t, int_t, Pslu_freeable_t *, - Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, - int_t *, int_t, int_t); - -static void -createComm (int, int, MPI_Comm *, MPI_Comm *); - -static void -domain_symbfact -(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, - int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *); - -static float -allocPrune_domain -(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -static float -allocPrune_lvl -(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -static int -symbfact_alloc -(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, - vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *); - -static float -symbfact_mapVtcs -(int, int, int, SuperMatrix *, int_t *, int_t *, - Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *); - -static void -symbfact_distributeMatrix -(int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, - Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *); - -static int_t -interLvl_symbfact -(SuperMatrix *, int, int, int, int, int, int, int, - int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, - Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *, - psymbfact_stat_t *, MPI_Comm, MPI_Comm *); - -static float -cntsVtcs -(int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, - int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *); - -/************************************************************************/ -float symbfact_dist -/************************************************************************/ -( - int nprocs_num, /* Input - no of processors */ - int nprocs_symb, /* Input - no of processors for the symbolic - factorization */ - SuperMatrix *A, /* Input - distributed input matrix */ - int_t *perm_c, /* Input - column permutation */ - int_t *perm_r, /* Input - row permutation */ - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, - global to local indexing information */ - MPI_Comm *num_comm, /* Input - communicator for numerical factorization */ - MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ - mem_usage_t *symb_mem_usage - ) -{ -/* - * Purpose - * ======= - * symbfact_dist() performs symbolic factorization of matrix A suitable - * for performing the supernodal Gaussian elimination with no pivoting (GEPP). - * This routine computes the structure of one column of L and one row of U - * at a time. It uses: - * o distributed input matrix - * o supernodes - * o symmetric structure pruning - * - * - * Arguments - * ========= - * - * nprocs_num (input) int - * Number of processors SuperLU_DIST is executed on, and the input - * matrix is distributed on. - * - * nprocs_symb (input) int - * Number of processors on which the symbolic factorization is - * performed. It is equal to the number of independent domains - * idenfied in the graph partitioning algorithm executed - * previously and has to be a power of 2. It corresponds to - * number of leaves in the separator tree. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The - * number of the linear equations is A->nrow. Matrix A is - * distributed in NRformat_loc format. - * Matrix A is not yet permuted by perm_c. - * - * perm_c (input) int_t* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int_t* - * Row permutation vector of size A->nrow, which defines the - * permutation matrix Pr; perm_r[i] = j means column i of A is - * in position j in Pr*A. - * - * sizes (input) int_t* - * Contains the number of vertices in each separator. - * - * fstVtxSep (input) int_t* - * Contains first vertex for each separator. - * - * Pslu_freeable (output) Pslu_freeable_t* - * Returns the local L and U structure, and global to local - * information on the indexing of the vertices. Contains all - * the information necessary for performing the data - * distribution towards the numeric factorization. - * - * num_comm (input) MPI_Comm* - * Communicator for numerical factorization - * - * symb_comm (input) MPI_Comm* - * Communicator for symbolic factorization - * - * symb_mem_usage (input) mem_usage_t * - * Statistics on memory usage. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the symbolic factorization. - * > 0, number of bytes allocated when out of memory. - * - * Sketch of the algorithm - * ======================= - * - * Distrbute the vertices on the processors using a subtree to - * subcube algorithm. - * - * Redistribute the structure of the input matrix A according to the - * subtree to subcube computed previously for the symbolic - * factorization routine. This implies in particular a distribution - * from nprocs_num processors to nprocs_symb processors. - * - * Perform symbolic factorization guided by the separator tree provided by - * a graph partitioning algorithm. The symbolic factorization uses a - * combined left-looking, right-looking approach. - * - */ - NRformat_loc *Astore; - int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep; - int iinfo; /* return code */ - int_t m, n; - int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr; - int_t ind_blk, nsuper, vtx, min_mn, nnzL, nnzU, szsn; - float stat_loc[23], stat_glob[23], mem_glob[15]; - - Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */ - vtcsInfo_symbfact_t VInfo; /* local information on number of blocks, - number of vertices in a block etc */ - matrix_symbfact_t AS; /* temporary storage for the input matrix after redistribution */ - comm_symbfact_t CS; /* information on communication */ - /* relaxation parameters (for future release) and - statistics collected during the symbolic factorization */ - psymbfact_stat_t PS; - /* temp array of size n, used as a marker by the subroutines */ - int_t *tempArray; - int_t i, j, k; - int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc; - int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd; - float totalMemLU, overestimMem; - MPI_Comm *commLvls; - - /* maximum block size */ - int_t maxSzBlk; - float flinfo; -#if ( PRNTlevel >= 1) - float stat_msgs_l[10], stat_msgs_g[10]; -#endif -#if ( PROFlevel>=1 ) - double t, t_symbFact[3], t_symbFact_loc[3]; - double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9]; -#endif - - /* Initialization */ - MPI_Comm_rank ((*num_comm), &iam); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter psymbfact()"); -#endif - initParmsAndStats (&PS); - if (nprocs_symb != 1) { - if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) { - fprintf (stderr, "Malloc fails for commLvls[]."); - return (PS.allocMem); - } - PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm); - } - else { - commLvls = NULL; - } - - nlvls = (int) log2( (double)nprocs_num ) + 1; -#if ( PROFlevel>=1 ) - time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) - * sizeof(double)); - time_lvls = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double)); - if (!time_lvls || !time_lvlsT) { - fprintf (stderr, "Malloc fails for time_lvls[]."); - return (PS.allocMem); - } - PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double); -#endif - - VInfo.xlsub_nextLvl = 0; - VInfo.xusub_nextLvl = 0; - VInfo.maxSzBlk = sp_ienv_dist(3); - maxSzBlk = VInfo.maxSzBlk; - - mark = EMPTY; - nsuper_loc = 0; - nextl = 0; nextu = 0; - neltsZr = 0; neltsTotal = 0; - - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN( m, n ); - - if (!(tempArray = intMalloc_symbfact(n))) { - fprintf (stderr, "Malloc fails for tempArray[].\n"); - return (PS.allocMem); - } - PS.allocMem += n * sizeof(int_t); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Distribute vertices on processors */ - if ((flinfo = - symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, - Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) - return (flinfo); - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - - /* Redistribute matrix A on processors following the distribution found - in symbfact_mapVtcs. Store the redistributed A temporarily into AS */ - symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb, A, - perm_c, perm_r, &AS, - Pslu_freeable, &VInfo, tempArray, num_comm); - - /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB - PROCESSORS */ - if ( iam < nprocs_symb ) { - -#if ( PROFlevel>=1 ) - t_symbFact_loc[0] = SuperLU_timer_() - t; - t = SuperLU_timer_(); - t_symbFact_loc[1] = t; -#endif - - /* Allocate storage common to the symbolic factor routines */ - if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, - &Llu_symbfact, &VInfo, &CS, &PS)) - return (PS.allocMem); - - /* Copy the redistributed input matrix AS at the end of the memory buffer - allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in - (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub). Free the - memory used to store the input matrix */ - nnz_ainf_loc = VInfo.nnz_ainf_loc; - nnz_asup_loc = VInfo.nnz_asup_loc; - j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc; - k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc; - for (i = 0; i <= VInfo.nvtcs_loc; i++) { - Llu_symbfact.xusub[i] = AS.x_asup[i] + j; - Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k; - } - - for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) - Llu_symbfact.usub[j] = AS.ind_asup[i]; - for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++) - Llu_symbfact.lsub[k] = AS.ind_ainf[i]; - SUPERLU_FREE( AS.x_ainf ); - SUPERLU_FREE( AS.x_asup ); - SUPERLU_FREE( AS.ind_ainf ); - SUPERLU_FREE( AS.ind_asup ); - - if (nprocs_symb != 1) { - createComm (iam, nprocs_symb, commLvls, symb_comm); -#if ( PROFlevel>=1 ) - t_symbFact_loc[2] = SuperLU_timer_(); -#endif - if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, - &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) - return (flinfo); - -#if ( PROFlevel>=1 ) - t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2]; -#endif - } - - /* set to EMPTY marker[] array */ - for (i = 0; i < n; i++) - tempArray[i] = EMPTY; - - szSep = nprocs_symb; - iSep = 0; - lvl = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - fstP = 0; - /* for each node in the level */ - for (jSep = iSep; jSep < iSep + szSep; jSep++) { - fstVtx = fstVtxSep[jSep]; - lstVtx = fstVtx + sizes[jSep]; - /* if this is the first level */ - if (szSep == nprocs_symb) { - /* compute symbolic factorization for my domain */ - if (fstP == iam) { - /* allocate storage for the pruned structures */ -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); -#endif - if ((flinfo = allocPrune_domain (fstVtx, lstVtx, - &Llu_symbfact, &VInfo, &PS)) > 0) - return (flinfo); - if (fstVtx < lstVtx) - VInfo.fstVtx_nextLvl = VInfo.begEndBlks_loc[2]; - - domain_symbfact - (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstVtx, lstVtx, - Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, tempArray, - &mark, &nextl, &nextu, &neltsZr, &neltsTotal, &nsuper_loc); - - PS.estimLSz = nextl; - PS.estimUSz = nextu; - if (nprocs_symb != 1) - if((flinfo = allocPrune_lvl (&Llu_symbfact, &VInfo, &PS)) > 0) - return (flinfo); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[lvl] = 0.; time_lvls[lvl+1] = 0.; - time_lvls[lvl + 2] = t2 - t1; -#endif - } - } - else { - lstP = fstP + npNode; - if (fstP <= iam && iam < lstP) { -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); -#endif - if (VInfo.filledSep != FILLED_SEPS) - initLvl_symbfact(n, iam, fstVtx, lstVtx, - Pslu_freeable, &Llu_symbfact, &VInfo, &PS, commLvls[jSep], - tempArray, nextl, nextu); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[3*lvl] = t2 - t1; -#endif - interLvl_symbfact (A, iam, lvl, szSep, fstP, lstP, - iSep, jSep, sizes, fstVtxSep, - &nextl, &nextu, &nsuper_loc, &mark, tempArray, - &Llu_symbfact, Pslu_freeable, &CS, &VInfo, &PS, - commLvls[jSep], symb_comm); -#if ( PROFlevel>=1 ) - t1 = SuperLU_timer_(); - time_lvls[3*lvl+1] = t1 - t2; -#endif - if (VInfo.filledSep != FILLED_SEPS) - intraLvl_symbfact - (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstP, lstP, - fstVtx, lstVtx, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, - tempArray, &mark, &nextl, &nextu, &neltsZr, &neltsTotal, - &nsuper_loc, commLvls[jSep], symb_comm); -#if ( PROFlevel>=1 ) - t2 = SuperLU_timer_(); - time_lvls[3*lvl+2] = t2 - t1; -#endif - } - } - fstP += npNode; - } - iSep += szSep; - szSep = szSep / 2; - lvl ++; - } - - SUPERLU_FREE( tempArray ); -#if 0 - SUPERLU_FREE( commLvls ); -#else /* XSL correction */ - if ( commLvls ) SUPERLU_FREE( commLvls ); -#endif - - /* Set up global information and collect statistics */ - if (PS.maxSzLPr < Llu_symbfact.indLsubPr) - PS.maxSzLPr = Llu_symbfact.indLsubPr; - if (PS.maxSzUPr < Llu_symbfact.indUsubPr) - PS.maxSzUPr = Llu_symbfact.indUsubPr; - - Llu_symbfact.xlsub[VInfo.nvtcs_loc] = nextl; - Llu_symbfact.xusub[VInfo.nvtcs_loc] = nextu; - fill_rcmd = SUPERLU_MAX( nextl / nnz_ainf_loc, nextu / nnz_asup_loc) + 1; - Pslu_freeable->xsup_beg_loc = intMalloc_dist (nsuper_loc+1); - Pslu_freeable->xsup_end_loc = intMalloc_dist (nsuper_loc+1); - if (!Pslu_freeable->xsup_beg_loc || !Pslu_freeable->xsup_end_loc) { - fprintf (stderr, "Malloc fails for xsup_beg_loc, xsup_end_loc."); - return (PS.allocMem); - } - PS.allocMem += 2 * (nsuper_loc+1) * sizeof(int_t); - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nnzL = 0; nnzU = 0; - - i = 0; - nsuper = 0; - ind_blk = 0; - for (ind_blk = 0; ind_blk < VInfo.nblks_loc; ind_blk ++) { - fstVtx = VInfo.begEndBlks_loc[2 * ind_blk]; - lstVtx = VInfo.begEndBlks_loc[2 * ind_blk + 1]; - fstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[fstVtx] ); - nsuper = Pslu_freeable->supno_loc[fstVtx_lid]; - Pslu_freeable->xsup_beg_loc[nsuper] = fstVtx; - szsn = 1; - if (INT_MAX - nnzL <= Llu_symbfact.xlsub[fstVtx_lid + 1] - - Llu_symbfact.xlsub[fstVtx_lid]) - printf ("PE[%d] ERR nnzL %d\n", iam, nnzL); - if (INT_MAX - nnzU <= Llu_symbfact.xusub[fstVtx_lid + 1] - - Llu_symbfact.xusub[fstVtx_lid]) - printf ("PE[%d] ERR nnzU %d\n", iam, nnzU); - - j = Llu_symbfact.xlsub[fstVtx_lid + 1] - Llu_symbfact.xlsub[fstVtx_lid]; - k = Llu_symbfact.xusub[fstVtx_lid + 1] - Llu_symbfact.xusub[fstVtx_lid]; - nnzL += j; - nnzU += k; - - for (vtx = fstVtx + 1, vtx_lid = fstVtx_lid + 1; - vtx < lstVtx; vtx++, vtx_lid ++) { - if (Pslu_freeable->supno_loc[vtx_lid] != nsuper) { - nsuper = Pslu_freeable->supno_loc[vtx_lid]; - Pslu_freeable->xsup_end_loc[nsuper-1] = vtx; - Pslu_freeable->xsup_beg_loc[nsuper] = vtx; - szsn = 1; - j = Llu_symbfact.xlsub[vtx_lid + 1] - Llu_symbfact.xlsub[vtx_lid]; - k = Llu_symbfact.xusub[vtx_lid + 1] - Llu_symbfact.xusub[vtx_lid]; - } - else { - szsn ++; - } - nnzL += j - szsn + 1; - nnzU += k - szsn + 1; - } - Pslu_freeable->xsup_end_loc[nsuper] = lstVtx; - } - Pslu_freeable->supno_loc[VInfo.nvtcs_loc] = nsuper_loc; - Pslu_freeable->nvtcs_loc = VInfo.nvtcs_loc; - - /* set up xsup data */ - Pslu_freeable->lsub = Llu_symbfact.lsub; - Pslu_freeable->xlsub = Llu_symbfact.xlsub; - Pslu_freeable->usub = Llu_symbfact.usub; - Pslu_freeable->xusub = Llu_symbfact.xusub; - Pslu_freeable->szLsub = Llu_symbfact.szLsub; - Pslu_freeable->szUsub = Llu_symbfact.szUsub; - -#if ( PROFlevel>=1 ) - t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1]; -#endif - -#if ( PRNTlevel>=1 ) - estimate_memUsage (n, iam, symb_mem_usage, - &totalMemLU, &overestimMem, - Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS); - stat_loc[0] = (float) nnzL; - stat_loc[1] = (float) nnzU; - stat_loc[2] = (float) nsuper_loc; - stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc]; - stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc]; - stat_loc[5] = totalMemLU; - stat_loc[6] = overestimMem; - stat_loc[7] = totalMemLU - overestimMem; - stat_loc[8] = (float) PS.maxSzBuf; - stat_loc[9] = (float) PS.nDnsUpSeps; - stat_loc[10] = (float) PS.nDnsCurSep; - stat_loc[11] = (float) (Llu_symbfact.no_expand + Llu_symbfact.no_expcp + - Llu_symbfact.no_expand_pr); - stat_loc[12] = (float) Llu_symbfact.no_expand; - stat_loc[13] = (float) Llu_symbfact.no_expcp; - stat_loc[14] = (float) Llu_symbfact.no_expand_pr; - stat_loc[15] = (float) fill_rcmd; - stat_loc[16] = PS.nops; - stat_loc[17] = PS.fill_pelt[1]; - stat_loc[18] = PS.fill_pelt[4]; - stat_loc[19] = PS.fill_pelt[0]; - stat_loc[20] = PS.fill_pelt[2]; - stat_loc[21] = PS.fill_pelt[3]; - stat_loc[22] = PS.fill_pelt[5]; - - MPI_Reduce (stat_loc, stat_glob, 23, MPI_FLOAT, - MPI_SUM, 0, (*symb_comm)); - MPI_Reduce (&(stat_loc[5]), mem_glob, 14, MPI_FLOAT, - MPI_MAX, 0, (*symb_comm)); - fill_rcmd = (int_t) mem_glob[10]; - PS.fill_pelt[0] = stat_glob[19]; - PS.fill_pelt[1] = mem_glob[12]; - PS.fill_pelt[2] = stat_glob[20]; - PS.fill_pelt[3] = stat_glob[21]; - PS.fill_pelt[4] = mem_glob[13]; - PS.fill_pelt[5] = stat_glob[22]; - if (PS.fill_pelt[2] == 0.) PS.fill_pelt[2] = 1.; - if (PS.fill_pelt[5] == 0.) PS.fill_pelt[5] = 1.; - -#if ( PROFlevel>=1 ) - MPI_Reduce (t_symbFact_loc, t_symbFact, 3, MPI_DOUBLE, - MPI_MAX, 0, (*symb_comm)); - MPI_Gather (time_lvls, 3 * nlvls, MPI_DOUBLE, - time_lvlsT, 3 * nlvls , MPI_DOUBLE, - 0, (*symb_comm)); -#endif - - stat_msgs_l[0] = (float) PS.maxsz_msgSnd; - stat_msgs_l[1] = (float) PS.maxsz_msgSnd; - if (PS.maxsz_msgSnd < PS.maxsz_msgCol) - stat_msgs_l[1] = PS.maxsz_msgCol; - stat_msgs_l[2] = PS.no_shmSnd + PS.no_msgsSnd + - PS.no_shmRcvd + PS.no_msgsRcvd; - stat_msgs_l[3] = stat_msgs_l[2] + PS.no_msgsCol; - stat_msgs_l[4] = stat_msgs_l[2]; - stat_msgs_l[5] = stat_msgs_l[3]; - stat_msgs_l[6] = PS.no_msgsSnd; - stat_msgs_l[7] = PS.no_msgsSnd + PS.no_msgsCol; - stat_msgs_l[8] = PS.sz_msgsSnd; - stat_msgs_l[9] = PS.sz_msgsSnd + PS.sz_msgsCol; - MPI_Reduce (stat_msgs_l, stat_msgs_g, 4, MPI_FLOAT, - MPI_MAX, 0, (*symb_comm)); - MPI_Reduce (&(stat_msgs_l[4]), &(stat_msgs_g[4]), 6, MPI_FLOAT, - MPI_SUM, 0, (*symb_comm)); - if (stat_msgs_g[6] == 0) stat_msgs_g[6] = 1; - if (stat_msgs_g[7] == 0) stat_msgs_g[7] = 1; - - if (!iam) { - nnzL = (int_t) stat_glob[0]; nnzU = (int_t) stat_glob[1]; - nsuper = (int_t) stat_glob[2]; - szLGr = (int_t) stat_glob[3]; szUGr = (int_t) stat_glob[4]; - printf("\tMax szBlk %ld\n", VInfo.maxSzBlk); -#if ( PRNTlevel>=2 ) - printf("\t relax_gen %.2f, relax_curSep %.2f, relax_seps %.2f\n", - PS.relax_gen, PS.relax_curSep, PS.relax_seps); -#endif - printf("\tParameters: fill mem %ld fill pelt %ld\n", - sp_ienv_dist(6), PS.fill_par); - printf("\tNonzeros in L %ld\n", nnzL); - printf("\tNonzeros in U %ld\n", nnzU); - printf("\tnonzeros in L+U-I %ld\n", - nnzL + nnzU); - printf("\tNo of supers %ld\n", nsuper); - printf("\tSize of G(L) %ld\n", szLGr); - printf("\tSize of G(U) %ld\n", szUGr); - printf("\tSize of G(L+U) %ld\n", szLGr+szUGr); - - printf("\tParSYMBfact (MB) :\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[0]*1e-6, - stat_glob[5]/nprocs_symb*1e-6); -#if ( PRNTlevel>=2 ) - printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[1]*1e-6, - stat_glob[6]/nprocs_symb*1e-6); - printf("\tsnd/rcv buffers (MB):\tL\\U MAX %.2f\tAVG %.2f\n", - mem_glob[3]*1e-6, - stat_glob[8]/nprocs_symb*1e-6); - printf("\tSYMBfact 2*n+4*nvtcs_loc+2*maxNvtcsNds_loc:\tL\\U %.2f\n", - (float) (2 * n * sizeof(int_t)) *1e-6); - printf("\tint_t %d, int %d, long int %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(int), sizeof(long int), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tDNS ALLSEPS:\t MAX %d\tAVG %.2f\n", - (int_t) mem_glob[4], stat_glob[9]/nprocs_symb); - printf("\tDNS CURSEP:\t MAX %d\tAVG %.2f\n\n", - (int_t) mem_glob[5], stat_glob[10]/nprocs_symb); - - printf("\t MAX FILL Mem(L+U) / Mem(A) per processor %ld\n", fill_rcmd); - printf("\t Per elt MAX %ld AVG %ld\n", - (int_t) PS.fill_pelt[4], (int_t)(PS.fill_pelt[3]/PS.fill_pelt[5])); - printf("\t Per elt RL MAX %ld AVG %ld\n", - (int_t) PS.fill_pelt[1], (int_t)(PS.fill_pelt[0]/PS.fill_pelt[2])); - printf("\tM Nops:\t MAX %.2f\tAVG %.2f\n", - mem_glob[11]*1e-6, (stat_glob[16]/nprocs_symb)*1e-6); - - - printf("\tEXPANSIONS: MAX/AVG\n"); - printf("\tTOTAL: %d / %.2f\n", - (int_t) mem_glob[6], stat_glob[11]/nprocs_symb); - printf("\tREALLOC: %.f / %.2f RL_CP %.f / %.2f PR_CP %.f / %.2f\n", - mem_glob[7], stat_glob[12]/nprocs_symb, - mem_glob[8], stat_glob[13]/nprocs_symb, - mem_glob[9], stat_glob[14]/nprocs_symb); - - printf ("\n\tDATA MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n", - stat_msgs_g[2]*1e-3, stat_msgs_g[4]/nprocs_symb*1e-3, - stat_msgs_g[0]*1e-6, stat_msgs_g[8] / stat_msgs_g[6]*1e-6); - printf ("\tTOTAL MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n", - stat_msgs_g[3]*1e-3, stat_msgs_g[5]/nprocs_symb*1e-3, - stat_msgs_g[1]*1e-6, stat_msgs_g[9]/stat_msgs_g[7]*1e-6); -#endif - -#if ( PROFlevel>=1 ) - printf("Distribute matrix time = %8.3f\n", t_symbFact[0]); - printf("Count vertices time = %8.3f\n", t_symbFact[2]); - printf("Symbfact DIST time = %8.3f\n", t_symbFact[1]); - - printf("\nLvl\t Time\t Init\t Inter\t Intra\n"); - time_lvlsg[0] = 0.; - for (i = 0; i < nlvls; i++) { - for (j = 1; j < 9; j++) - time_lvlsg[j] = 0.; - for (p = 0; p < nprocs_symb; p++) { - k = p * 3 * nlvls; - t = time_lvlsT[i*3+k] + time_lvlsT[i*3+k+1] + time_lvlsT[i*3+k+2]; - if (t > time_lvlsg[1]) { - time_lvlsg[1] = t; j = p; - } - time_lvlsg[2] += t; - if (time_lvlsT[i*3+k] > time_lvlsg[3]) - time_lvlsg[3] = time_lvlsT[i*3+k]; - time_lvlsg[4] += time_lvlsT[i*3+k]; - if (time_lvlsT[i*3+k+1] > time_lvlsg[5]) - time_lvlsg[5] = time_lvlsT[i*3+k+1]; - time_lvlsg[6] += time_lvlsT[i*3+k+1]; - if (time_lvlsT[i*3+k+2] > time_lvlsg[7]) - time_lvlsg[7] = time_lvlsT[i*3+k+2]; - time_lvlsg[8] += time_lvlsT[i*3+k+2]; - } - time_lvlsg[0] += time_lvlsg[1]; - printf ("%d \t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\n", i, - time_lvlsg[1], time_lvlsg[2] / nprocs_symb, - time_lvlsg[3], time_lvlsg[4] / nprocs_symb, - time_lvlsg[5], time_lvlsg[6] /nprocs_symb, - time_lvlsg[7], time_lvlsg[8] / nprocs_symb); - } - printf("\t %8.3f \n", time_lvlsg[0]); -#endif - } -#endif -#if ( PROFlevel>=1 ) - SUPERLU_FREE (time_lvls); - SUPERLU_FREE (time_lvlsT); -#endif - symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS); - } /* if (iam < nprocs_symb) */ - else { - /* update Pslu_freeable before returning */ - Pslu_freeable->nvtcs_loc = 0; - Pslu_freeable->xlsub = NULL; Pslu_freeable->lsub = NULL; - Pslu_freeable->xusub = NULL; Pslu_freeable->usub = NULL; - Pslu_freeable->supno_loc = NULL; - Pslu_freeable->xsup_beg_loc = NULL; - Pslu_freeable->xsup_end_loc = NULL; - - SUPERLU_FREE( tempArray ); - PS.allocMem -= n * sizeof(int_t); - } - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit psymbfact()"); -#endif - - return (- PS.allocMem); -} /* SYMBFACT_DIST */ - - -static int_t -initParmsAndStats -( - psymbfact_stat_t *PS /* Output -statistics*/ -) -/* - * Purpose - * ======= - * Initialize relaxation parameters and statistics variables - */ -{ - int i; - - PS->nDnsCurSep = 0; - PS->nDnsUpSeps = 0; - - PS->relax_gen = 1.0; - PS->relax_curSep = 1.0; - PS->relax_seps = 1.0; - PS->fill_par = sp_ienv_dist(6); - PS->nops = 0.; - PS->no_shmSnd = 0.; - PS->no_msgsSnd = 0.; - PS->maxsz_msgSnd = 0; - PS->sz_msgsSnd = 0.; - PS->no_shmRcvd = 0.; - PS->no_msgsRcvd = 0.; - PS->maxsz_msgRcvd = 0; - PS->sz_msgsRcvd = 0.; - PS->no_msgsCol = 0.; - PS->maxsz_msgCol = 0; - PS->sz_msgsCol = 0.; - - for (i = 0; i < 6; i++) - PS->fill_pelt[i] = 0.; - - PS->estimUSz = 0; - PS->estimLSz = 0; - PS->maxSzLPr = 0; - PS->maxSzUPr = 0; - PS->maxSzBuf = 0; - PS->szDnsSep = 0; - PS->allocMem = 0; -} - -static float -cntsVtcs -( - int_t n, /* Input - order of the input matrix */ - int iam, /* Input - my processor number */ - int nprocs_symb, /* Input - no of processors for symbolic factorization */ - Pslu_freeable_t *Pslu_freeable, /* Input -globToLoc and maxNvtcsPProc */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output -local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - int_t *tempArray, /* Input - temporary storage */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t *sizes, /* Input - sizes of each node in the tree */ - psymbfact_stat_t *PS, /* Input/Output -statistics */ - MPI_Comm *commLvls - ) -/* - * Purpose - * ======= - * - * Computes an estimation of the number of elements in columns of L - * and rows of U. Stores this information in cntelt_vtcs, and it will - * be used in the right-looking symbolic factorization. - */ -{ - int fstP, lstP, szSep, npNode, i, j; - int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk; - int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk; - int_t nelts, nelts_new_blk; - int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc; - int_t *minElt_vtx, *cntelt_vtcs; - - /* Initialization */ - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - cntelt_vtcs = Llu_symbfact->cntelt_vtcs; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = VInfo->nvtcs_loc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n) - minElt_vtx = lsub; - else { - /* allocate memory for minElt_vtx */ - if (!(minElt_vtx = intMalloc_dist(n))) { - fprintf(stderr, "Malloc fails for minElt_vtx[]."); - return (PS->allocMem); - } - PS->allocMem += n * sizeof (int_t); - } - - for (ii = 0; ii < n; ii++) - tempArray[ii] = n; - for (ii = 0; ii < nvtcs_loc; ii++) - cntelt_vtcs[ii] = 0; - - szSep = nprocs_symb; - i = 0; - cur_blk = 0; - vtx_lid = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - fstP = 0; - /* for each node in the level */ - for (j = i; j < i + szSep; j++) { - fstVtx = fstVtxSep[j]; - lstVtx = fstVtx + sizes[j]; - lstP = fstP + npNode; - - if (fstP <= iam && iam < lstP) { - ind_blk = cur_blk; - ii = vtx_lid; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) { - for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) { - vtx_elt = lsub[jj]; - if (tempArray[vtx_elt] == n) { - tempArray[vtx_elt] = vtx; - } - } - for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) { - vtx_elt = usub[jj]; - if (tempArray[vtx_elt] == n) { - tempArray[vtx_elt] = vtx; - } - } - } - } - if (szSep == nprocs_symb) - vtx_lid = ii; - else { - MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]), - (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int) log2( (double) npNode )); - PS->sz_msgsCol += (float) (n - fstVtx); - if (PS->maxsz_msgCol < n - fstVtx) - PS->maxsz_msgCol = n - fstVtx; -#endif - - nelts = 0; - for (ii = fstVtx; ii < lstVtx; ii++) - tempArray[ii] = 0; - for (ii = fstVtx; ii < n; ii++) { - if (minElt_vtx[ii] != n) { - if (minElt_vtx[ii] < fstVtx) - nelts ++; - else - tempArray[minElt_vtx[ii]] ++; - if (ii > lstVtx) - tempArray[ii] = minElt_vtx[ii]; - } - } - - ind_blk = cur_blk; - lv = fstVtx; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - - for (ii = lv; ii < fstVtx_blk; ii++) - nelts += tempArray[ii]; - lv = lstVtx_blk; - - nelts_new_blk = 0; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - nelts_new_blk += tempArray[vtx]; - cntelt_vtcs[vtx_lid] = nelts; - } - nelts += nelts_new_blk; - } - } /* if (szSep != nprocs_symb) */ - cur_blk = ind_blk; - } - fstP += npNode; - } - i += szSep; - szSep = szSep / 2; - } - /* free memory */ - if (minElt_vtx != lsub) { - SUPERLU_FREE (minElt_vtx); - PS->allocMem -= n * sizeof(int_t); - } - return (SUCCES_RET); -} - -static float -symbfact_mapVtcs -( - int iam, /* Input -process number */ - int nprocs_num, /* Input -number of processors */ - int nprocs_symb, /* Input -number of procs for symbolic factorization */ - SuperMatrix *A, /* Input -input distributed matrix A */ - int_t *fstVtxSep, /* Input -first vertex in each separator */ - int_t *sizes, /* Input -size of each separator in the separator tree */ - Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc - computed */ - vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */ - int_t *tempArray, /* Input -temp array of size n = order of the matrix */ - int_t maxSzBlk, /* Input -maximum number of vertices in a block */ - psymbfact_stat_t *PS /* Input/Output -statistics */ - ) -{ -/* - * Purpose - * ======= - * - * symbfact_mapVtcs maps the vertices of the graph of the input - * matrix A on nprocs_symb processors, using the separator tree - * returned by a graph partitioning algorithm from the previous step - * of the symbolic factorization. The number of processors - * nprocs_symb must be a power of 2. - * - * Description of the algorithm - * ============================ - * - * A subtree to subcube algorithm is used first to map the processors - * on the nodes of the separator tree. - * - * For each node of the separator tree, its corresponding vertices - * are distributed on the processors affected to this node, using a - * block cyclic distribution. - * - * After the distribution, fields of the VInfo structure are - * computed. The array globToLoc and maxNvtcsPProc of Pslu_freeable - * are also computed. - * - */ - int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d; - int_t k, n, kk; - int_t fstVtx, lstVtx; - int_t fstVtxBlk, ind_blk; - int_t noVtcsProc, noBlk; - int_t nvtcs_loc; /* number of vertices owned by process iam */ - int_t nblks_loc; /* no of blocks owned by process iam */ - int_t *globToLoc; /* global indexing to local indexing */ - int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx; - int_t *begEndBlks_loc; /* begin and end vertex of each local block */ - int_t *vtcs_pe; /* contains the number of vertices on each processor */ - int *avail_pes; /* contains the processors to be used at each level */ - - n = A->ncol; - /* allocate memory */ - if (!(globToLoc = intMalloc_dist(n + 1))) { - fprintf (stderr, "Malloc fails for globToLoc[]."); - return (PS->allocMem); - } - PS->allocMem += (n+1) * sizeof(int_t); - if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) { - fprintf (stderr, "Malloc fails for avail_pes[]."); - return (PS->allocMem); - } - PS->allocMem += nprocs_symb*sizeof(int); - if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) { - fprintf (stderr, "Malloc fails for vtcs_pe[]."); - return (PS->allocMem); - } - PS->allocMem += nprocs_symb*sizeof(int_t); - - /* Initialization */ - globToLoc[n] = n; - for (p = 0; p < nprocs_symb; p++) { - vtcs_pe[p] = 0; - avail_pes[p] = EMPTY; - } - nvtcs_loc = 0; - nblks_loc = 0; - maxNvtcsNds_loc = 0; - maxNeltsVtx = 0; - - /* distribute data among processors */ - szSep = nprocs_symb; - iSep = 0; - while (szSep >= 1) { - /* for each level in the separator tree */ - npNode = nprocs_symb / szSep; - firstP = 0; - nvtcsNds_loc = 0; - - for (jSep = iSep; jSep < iSep + szSep; jSep++) { - /* for each node in the level */ - fstVtx = fstVtxSep[jSep]; - lstVtx = fstVtx + sizes[jSep]; - if (firstP <= iam && iam < firstP + npNode) - maxNeltsVtx += lstVtx - fstVtx; - - if (szSep == nprocs_symb) { - /* leaves of the separator tree */ - for (k = fstVtx; k < lstVtx; k++) { - globToLoc[k] = (int_t) firstP; - vtcs_pe[firstP] ++; - } - if (firstP == iam) { - nvtcs_loc += lstVtx - fstVtx; - nblks_loc ++; - } - } - else { - /* superior levels of the separator tree */ - k = fstVtx; - noVtcsProc = maxSzBlk; - fstVtxBlk = fstVtx; - if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode; - /* first allocate processors from previous levels */ - for (ind_ap_s = (jSep-iSep) * npNode; ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) { - p = avail_pes[ind_ap_s]; - if (p != EMPTY && k < lstVtx) { - /* for each column in the separator */ - avail_pes[ind_ap_s] = EMPTY; - kk = 0; - while (kk < noVtcsProc && k < lstVtx) { - globToLoc[k] = p; - vtcs_pe[p] ++; - k ++; - kk ++; - } - if (p == iam) { - nvtcs_loc += kk; - nblks_loc ++; - nvtcsNds_loc += kk; - } - } - else { - if (p != EMPTY && k == lstVtx) { - avail_pes[ind_ap_s] = EMPTY; - avail_pes[ind_ap_d] = p; ind_ap_d ++; - } - } - } - noBlk = 0; - p = firstP + npNode; - while (k < lstVtx) { - /* for each column in the separator */ - kk = 0; - p = (int) (noBlk % (int_t) npNode) + firstP; - while (kk < noVtcsProc && k < lstVtx) { - globToLoc[k] = p; - vtcs_pe[p] ++; - k ++; - kk ++; - } - if (p == iam) { - nvtcs_loc += kk; - nblks_loc ++; - nvtcsNds_loc += kk; - } - noBlk ++; - } /* while (k < lstVtx) */ - /* Add the unused processors to the avail_pes list of pes */ - for (p = p + 1; p < firstP + npNode; p ++) { - avail_pes[ind_ap_d] = p; ind_ap_d ++; - } - } - firstP += npNode; - } - if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb) - maxNvtcsNds_loc = nvtcsNds_loc; - iSep += szSep; - szSep = szSep / 2; - } - -#if ( PRNTlevel>=2 ) - if (!iam) - PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe); -#endif - /* determine maximum number of vertices among processors */ - maxNvtcsPProc = vtcs_pe[0]; - vtcs_pe[0] = 0; - for (p = 1; p < nprocs_symb; p++) { - if (maxNvtcsPProc < vtcs_pe[p]) - maxNvtcsPProc = vtcs_pe[p]; - vtcs_pe[p] = 0; - } -#if ( PRNTlevel>=2 ) - if (!iam) - printf (" MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n", - maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb)/(float)n); -#endif - - if (iam < nprocs_symb) - if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1))) - ABORT("Malloc fails for begEndBlks_loc[]."); - - ind_blk = 0; - k = 0; - while (k < n) { - p = globToLoc[k]; - if (p == iam) - begEndBlks_loc[ind_blk] = k; - while (globToLoc[k] == p && k < n) { - globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p]; - vtcs_pe[p] ++; - k ++; - } - if (p == iam) { - begEndBlks_loc[ind_blk + 1] = k; - ind_blk += 2; - } - } - if (iam < nprocs_symb) - begEndBlks_loc[2 * nblks_loc] = n; - - SUPERLU_FREE (avail_pes); - - Pslu_freeable->maxNvtcsPProc = maxNvtcsPProc; - Pslu_freeable->globToLoc = globToLoc; - if (iam < nprocs_symb) { - VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc; - VInfo->nblks_loc = nblks_loc; - VInfo->nvtcs_loc = nvtcs_loc; - VInfo->curblk_loc = 0; - VInfo->maxNeltsVtx = maxNeltsVtx; - VInfo->filledSep = FALSE; - VInfo->xlsub_nextLvl = 0; - VInfo->xusub_nextLvl = 0; - VInfo->begEndBlks_loc = begEndBlks_loc; - VInfo->fstVtx_nextLvl = begEndBlks_loc[0]; - } - return SUCCES_RET; -} - -static void -symbfact_distributeMatrix -( - int iam, /* Input - my processor number */ - int nprocs_num, /* Input - number of processors */ - int nprocs_symb, /* Input - number of processors for the - symbolic factorization */ - SuperMatrix *A, /* Input - input matrix A */ - int_t *perm_c, /* Input - column permutation */ - int_t *perm_r, /* Input - row permutation */ - matrix_symbfact_t *AS, /* Output - temporary storage for the - redistributed matrix */ - Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices - distribution */ - int_t *tempArray, /* Input/Output - temporary array of size n - (order of the matrix) */ - MPI_Comm *num_comm /* Input - communicator for nprocs_num procs */ - ) -{ -/* - * Purpose - * ======= - * - * Distribute input matrix A for the symbolic factorization routine. - * Only structural information is distributed. The redistributed - * matrix has its rows and columns permuted according to perm_r and - * perm_c. A is not modified during this routine. - * - */ -/* Notations: - * Ainf : inferior part of A, including diagonal. - * Asup : superior part of A. - */ - int p, p_irow, code_err, ainf_data; - int_t n, m_loc, fst_row; - int_t i, j, k, irow, jcol; - NRformat_loc *Astore; - int_t nnz_loc, nnz_iam; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be received */ - /* number of nonzeros to send/receive per processor */ - int_t *nnzToSend, *nnzToRecv; - int_t *nnzAinf_toSnd; /* nnz in Ainf to send */ - /* VInfo data structures */ - int_t *globToLoc, *begEndBlks_loc, nblks_loc, nvtcs_loc, maxNvtcsPProc; - - int_t neltsRow, vtx, vtx_lid, nelts, ind; - int_t *snd_aind, *rcv_aind; - int_t *ptr_toSnd, *buf, *ptr_toRcv; - /* matrix_symbfact_t *As data */ - int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup; - int *intBuf1, *intBuf2, *intBuf3, *intBuf4; - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nnzToRecv = intCalloc_symbfact(3 * (int_t)nprocs_num); - nnzToSend = nnzToRecv + nprocs_num; - nnzAinf_toSnd = nnzToRecv + 2 * nprocs_num; - - /* --------------------------------------------------------------------- - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE - SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. - ----------------------------------------------------------------------*/ - /* tempArray stores the number of nonzeros in each column of ainf */ - for (i = 0; i < n; i++) - tempArray[i] = 0; - for (i = 0; i < m_loc; i++) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - p_irow = OWNER(globToLoc[irow]); - neltsRow = 0; - - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) { - jcol = perm_c[Astore->colind[j]]; - if (jcol <= irow) { - p = OWNER(globToLoc[jcol]); - if (tempArray[jcol] == 0) { - nnzToSend[p] += 2; - nnzAinf_toSnd[p] += 2; - } - tempArray[jcol] ++; - nnzAinf_toSnd[p] ++; - } - else { - p = p_irow; - neltsRow ++; - } - nnzToSend[p] ++; - } - if (neltsRow != 0) { - nnzToSend[p_irow] += 2; - } - } - - /* add one entry which will separate columns of Ainf from rows - of Asup */ - for (p = 0; p < nprocs_num; p++) - if (nnzToSend[p] != 0) - nnzToSend[p] ++; - - /* All-to-all communication */ - MPI_Alltoall (nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - (*num_comm)); - - nnz_loc = SendCnt = RecvCnt = 0; - for (p = 0; p < nprocs_num; p++) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - } else { - nnz_loc += nnzToRecv[p]; - nnzToSend[p] = 0; - } - } - nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0) - ABORT("Malloc fails for snd_aind[]."); - if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1))) - ABORT("Malloc fails for rcv_aind[]."); - if ( !(ptr_toSnd = intCalloc_symbfact((int_t) nprocs_num)) ) - ABORT("Malloc fails for ptr_toSnd[]."); - if ( !(ptr_toRcv = intCalloc_symbfact((int_t) nprocs_num)) ) - ABORT("Malloc fails for ptr_toRcv[]."); - - /* setup ptr_toSnd[p] to point to data in snd_aind to be send to - processor p */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - if ( p != iam ) - ptr_toSnd[p] = i; - else - ptr_toSnd[p] = j; - i += nnzToSend[p]; - j += nnzToRecv[p]; - } - - for (i = 0; i < n; i++) { - if (tempArray[i] != 0) { - /* column i of Ainf will be send to a processor */ - p = OWNER( globToLoc[i] ); - if (p == iam) { - buf = &(rcv_aind[ptr_toSnd[p]]); - } - else { - buf = &(snd_aind[ptr_toSnd[p]]); - } - buf[0] = tempArray[i]; - buf[1] = i; - tempArray[i] = ptr_toSnd[p] + 2; - ptr_toSnd[p] += 2 + buf[0]; - } - } - - /* set ptr_toSnd to point to Asup data (stored by rows) */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - if ( p != iam ) { - if (nnzToSend[p] != 0) { - snd_aind[i + nnzAinf_toSnd[p]] = EMPTY; - ptr_toSnd[p] = i + nnzAinf_toSnd[p] + 1; - } - } - else { - if (nnzToRecv[p] != 0) { - rcv_aind[j + nnzAinf_toSnd[p]] = EMPTY; - ptr_toSnd[p] = j + nnzAinf_toSnd[p] + 1; - } - } - i += nnzToSend[p]; - j += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE snd_aind STRUCTURE TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - For each processor, we store first the columns to be sent, - and then the rows to be sent. For each row/column sent: - entry 0 : x = number of elements in that row/column - entry 1 : row/column number - entries 2 .. x + 2 : row/column indices. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; i++) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*A */ - p_irow = OWNER( globToLoc[irow] ); - ptr_toSnd[p_irow] +=2; - neltsRow = 0; - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) { - jcol = perm_c[Astore->colind[j]]; - if (jcol <= irow) { - p = OWNER( globToLoc[jcol] ); - k = tempArray[jcol]; - tempArray[jcol] ++; - if (p == iam) { /* local */ - rcv_aind[k] = irow; - } - else { - snd_aind[k] = irow; - } - } - else { - p = p_irow; - neltsRow ++; - k = ptr_toSnd[p]; - ptr_toSnd[p] ++; - if (p == iam) { /* local */ - rcv_aind[k] = jcol; - } - else { - snd_aind[k] = jcol; - } - } - } - - if (neltsRow == 0) - ptr_toSnd[p_irow] -= 2; - else { - /* store entry 0 and entry 1 */ - if (p_irow == iam) { /* local */ - rcv_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow; - rcv_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow; - } - else { /* remote */ - snd_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow; - snd_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow; - } - } - } - - /* reset ptr_toSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv */ - for (i = 0, j = 0, p = 0; p < nprocs_num; p++) { - ptr_toSnd[p] = i; - i += nnzToSend[p]; - ptr_toRcv[p] = j; - j += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs_num > 1) { -#if defined (_LONGINT) - intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs_num * sizeof(int)); - intBuf2 = intBuf1 + nprocs_num; - intBuf3 = intBuf1 + 2 * nprocs_num; - intBuf4 = intBuf1 + 3 * nprocs_num; - - for (p=0; p INT_MAX || ptr_toSnd[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptr_toRcv[p] > INT_MAX) - ABORT("ERROR in symbfact_distributeMatrix size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptr_toSnd[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptr_toRcv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptr_toSnd; - intBuf3 = nnzToRecv; intBuf4 = ptr_toRcv; -#endif - - MPI_Alltoallv (snd_aind, intBuf1, intBuf2, mpi_int_t, - rcv_aind, intBuf3, intBuf4, mpi_int_t, - (*num_comm)); -#if defined (_LONGINT) - SUPERLU_FREE (intBuf1); -#endif - } - - /* ------------------------------------------------------------ - DEALLOCATE SEND STORAGE - ------------------------------------------------------------*/ - if (snd_aind) SUPERLU_FREE( snd_aind ); - SUPERLU_FREE( ptr_toSnd ); - - /* ------------------------------------------------------------ - CONVERT THE RECEIVED FORMAT INTO THE SYMBOLIC FORMAT. - THIS IS PERFORMED ONLY BY NPROCS_SYMB PROCESSORS - ------------------------------------------------------------*/ - if (iam < nprocs_symb) { - nblks_loc = VInfo->nblks_loc; - begEndBlks_loc = VInfo->begEndBlks_loc; - nvtcs_loc = VInfo->nvtcs_loc; - /* ------------------------------------------------------------ - Allocate space for storing indices of A after redistribution. - ------------------------------------------------------------*/ - if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1))) - ABORT("Malloc fails for x_ainf[]."); - if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1))) - ABORT("Malloc fails for x_asup[]."); - - /* Initialize the array of columns/rows pointers */ - for (i = 0, p = 0; p < nprocs_num; p++) { - ainf_data = TRUE; - k = 0; - while (k < nnzToRecv[p]) { - j = rcv_aind[i + k]; - if (j == EMPTY) { - ainf_data = FALSE; - k ++; - } - else { - nelts = rcv_aind[i + k]; - vtx = rcv_aind[i + k + 1]; - vtx_lid = LOCAL_IND( globToLoc[vtx] ); - k += nelts + 2; - if (ainf_data) - x_ainf[vtx_lid] += nelts; - else - x_asup[vtx_lid] = nelts; - } - } - i += nnzToRecv[p]; - } - - /* copy received information */ - vtx_lid = 0; - for (i = 0, k = 0, j = 0; i < nblks_loc; i++) { - for (vtx = begEndBlks_loc[2*i]; vtx < begEndBlks_loc[2*i+1]; vtx++, vtx_lid ++) { - nelts = x_ainf[vtx_lid]; - x_ainf[vtx_lid] = k; - k += nelts; - nelts = x_asup[vtx_lid]; - x_asup[vtx_lid] = j; - j += nelts; - tempArray[vtx] = x_ainf[vtx_lid]; - } - } - x_ainf[nvtcs_loc] = k; - x_asup[nvtcs_loc] = j; - - /* Allocate space for storing indices of A after conversion */ - if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) && x_ainf[nvtcs_loc] != 0 ) - ABORT("Malloc fails for ind_ainf[]."); - if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) && x_asup[nvtcs_loc] != 0) - ABORT("Malloc fails for ind_asup[]."); - - /* Copy the data into the row/column oriented storage */ - for (i = 0, p = 0; p < nprocs_num; p++) { - ainf_data = TRUE; - k = 0; - while (k < nnzToRecv[p]) { - j = rcv_aind[i + k]; - if (ainf_data && j == EMPTY) { - ainf_data = FALSE; - k ++; - } - else { - nelts = rcv_aind[i + k]; - vtx = rcv_aind[i + k + 1]; - vtx_lid = LOCAL_IND( globToLoc[vtx] ); - if (ainf_data) { - /* traverse ainf data */ - ind = tempArray[vtx]; - for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) - ind_ainf[ind] = rcv_aind[j]; - tempArray[vtx] = ind; - } - else { - /* traverse asup data */ - ind = x_asup[vtx_lid]; - for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) - ind_asup[ind] = rcv_aind[j]; - } - k += nelts + 2; - } - } - i += nnzToRecv[p]; - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - if (nprocs_symb > 1) { - SUPERLU_FREE( ptr_toRcv ); - if (rcv_aind) SUPERLU_FREE( rcv_aind ); - if (nnzToRecv) SUPERLU_FREE( nnzToRecv ); - } - - AS->x_ainf = x_ainf; - AS->x_asup = x_asup; - AS->ind_ainf = ind_ainf; - AS->ind_asup = ind_asup; - - VInfo->nnz_asup_loc = x_asup[nvtcs_loc]; - VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc]; - } -} - -static -float allocPrune_lvl -( - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data - structures */ - vtcsInfo_symbfact_t *VInfo, /* Input -local info on vertices - distribution */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for data structures necessary for pruned graphs. - * For those unpredictable size, make a guess as FILL * n. - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int_t lword; - int_t nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr; - int_t nvtcs_loc, no_expand_pr, x_sz; - float alpha = 1.5; - int_t FILL = sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - - no_expand_pr = 0; - lword = (int_t) sizeof(int_t); - - /* free memory allocated for the domain symbolic factorization */ - if (Llu_symbfact->szLsubPr) - SUPERLU_FREE( Llu_symbfact->lsubPr ); - if (Llu_symbfact->szUsubPr) - SUPERLU_FREE( Llu_symbfact->usubPr ); - if (Llu_symbfact->xlsubPr) - SUPERLU_FREE( Llu_symbfact->xlsubPr ); - if (Llu_symbfact->xusubPr) - SUPERLU_FREE( Llu_symbfact->xusubPr ); - - Llu_symbfact->xlsub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1); - Llu_symbfact->xusub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1); - - /* allocate memory to use during superior levels of sep_tree */ - x_sz = SUPERLU_MIN( VInfo->maxNvtcsNds_loc, VInfo->maxSzBlk); - nzlmaxPr = 2 * FILL * VInfo->maxNvtcsNds_loc; - nzumaxPr = 2 * FILL * VInfo->maxSzBlk; - - /* Integer pointers for L\U factors */ - if (x_sz != 0) { - xlsubPr = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1); - xusubPr = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1); - - lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword); - - while ( !lsubPr || !usubPr ) { - if (!lsubPr) SUPERLU_FREE(lsubPr); - if (!usubPr) SUPERLU_FREE(usubPr); - - nzlmaxPr /= 2; nzlmaxPr = alpha * (float) nzlmaxPr; - nzumaxPr /= 2; nzumaxPr = alpha * (float) nzumaxPr; - - if ( nzumaxPr < x_sz ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsubPr = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword); - ++no_expand_pr; - } - } - else { - xlsubPr = NULL; lsubPr = NULL; - xusubPr = NULL; usubPr = NULL; - } - - if (VInfo->maxNvtcsNds_loc) - Llu_symbfact->cntelt_vtcsA_lvl = - (int_t *) SUPERLU_MALLOC (VInfo->maxNvtcsNds_loc * lword); - - if (PS->maxSzLPr < Llu_symbfact->indLsubPr) - PS->maxSzLPr = Llu_symbfact->indLsubPr; - if (PS->maxSzUPr < Llu_symbfact->indUsubPr) - PS->maxSzUPr = Llu_symbfact->indUsubPr; - - Llu_symbfact->lsubPr = lsubPr; - Llu_symbfact->xlsubPr = xlsubPr; - Llu_symbfact->usubPr = usubPr; - Llu_symbfact->xusubPr = xusubPr; - Llu_symbfact->szLsubPr = nzlmaxPr; - Llu_symbfact->szUsubPr = nzumaxPr; - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - - Llu_symbfact->no_expand_pr += no_expand_pr; - return 0; -} - -static float -allocPrune_domain -( - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data - structures */ - vtcsInfo_symbfact_t *VInfo, /* Input -local info on vertices - distribution */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for data structures necessary for pruned graphs. - * For those unpredictable size, make a guess as FILL * n. - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int_t lword; - int_t nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr; - int_t nvtcs_loc, no_expand_pr, x_sz; - float alpha = 1.5; - int_t FILL = 2 * sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - - no_expand_pr = 0; - lword = (int_t) sizeof(int_t); - - /* allocate memory to use during domain_symbolic routine */ - /* Guess for prune graph */ - x_sz = lstVtx - fstVtx; - nzlmaxPr = nzumaxPr = 2*FILL * x_sz; - - /* Integer pointers for L\U factors */ - if (x_sz != 0) { - xlsubPr = intMalloc_symbfact(x_sz+1); - xusubPr = intMalloc_symbfact(x_sz+1); - - lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword); - usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword); - - while ( !lsubPr || !usubPr ) { -#if 0 - if (!lsubPr) SUPERLU_FREE(lsubPr); - if (!usubPr) SUPERLU_FREE(usubPr); -#else /* XSL correction */ - if ( lsubPr ) SUPERLU_FREE(lsubPr); - if ( usubPr ) SUPERLU_FREE(usubPr); -#endif - - nzlmaxPr /= 2; nzlmaxPr = alpha * (float) nzlmaxPr; - nzumaxPr /= 2; nzumaxPr = alpha * (float) nzumaxPr; - - if ( nzumaxPr < x_sz ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsubPr = (void *) SUPERLU_MALLOC(nzlmaxPr * lword); - usubPr = (void *) SUPERLU_MALLOC(nzumaxPr * lword); - ++no_expand_pr; - } - } - else { - xlsubPr = NULL; - xusubPr = NULL; - } - - Llu_symbfact->lsubPr = lsubPr; - Llu_symbfact->xlsubPr = xlsubPr; - Llu_symbfact->usubPr = usubPr; - Llu_symbfact->xusubPr = xusubPr; - Llu_symbfact->szLsubPr = nzlmaxPr; - Llu_symbfact->szUsubPr = nzumaxPr; - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - Llu_symbfact->xlsub_rcvd = NULL; - Llu_symbfact->xusub_rcvd = NULL; - Llu_symbfact->cntelt_vtcsA_lvl = NULL; - - PS->maxSzLPr = Llu_symbfact->indLsubPr; - PS->maxSzUPr = Llu_symbfact->indUsubPr; - - Llu_symbfact->no_expand_pr = no_expand_pr; - Llu_symbfact->no_expcp = 0; - return 0; -} - -/************************************************************************/ -static -int symbfact_alloc -/************************************************************************/ -( - int_t n, /* Input - order of the matrix */ - int nprocs, /* Input - number of processors for the symbolic - factorization */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices - distribution */ - comm_symbfact_t *CS, /* Input -information on communication */ - psymbfact_stat_t *PS /* Input -statistics */ - ) -/* - * Allocate storage for the data structures common to symbolic factorization - * routines. For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * 0 if enough memory was available; - * otherwise, return the amount of space intended to allocate - * when memory allocation failure occurred. - */ -{ - int nlvls, p; /* no of levels in the separator tree */ - int_t lword, no_expand; - int_t *xsup, *supno; - int_t *lsub, *xlsub; - int_t *usub, *xusub; - int_t nzlmax, nzumax, nnz_a_loc; - int_t nvtcs_loc, *cntelt_vtcs; - float alpha = 1.5; - int_t FILL = sp_ienv_dist(6); - - nvtcs_loc = VInfo->nvtcs_loc; - nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc; - nlvls = (int) log2( (double) nprocs ) + 1; - no_expand = 0; - lword = sizeof(int_t); - - /* Guess for L\U factors */ - nzlmax = nzumax = FILL * nnz_a_loc; - - /* Integer pointers for L\U factors */ - supno = intMalloc_symbfact(nvtcs_loc+1); - xlsub = intMalloc_symbfact(nvtcs_loc+1); - xusub = intMalloc_symbfact(nvtcs_loc+1); - - lsub = (void *) SUPERLU_MALLOC(nzlmax * lword); - usub = (void *) SUPERLU_MALLOC(nzumax * lword); - - while ( !lsub || !usub ) { - if (!lsub) SUPERLU_FREE(lsub); - if (!usub) SUPERLU_FREE(usub); - - nzlmax /= 2; nzlmax = alpha * nzlmax; - nzumax /= 2; nzumax = alpha * nzumax; - - if ( nzumax < nnz_a_loc/2 ) { - fprintf(stderr, "Not enough memory to perform factorization.\n"); - return (PS->allocMem); - } - lsub = (void *) SUPERLU_MALLOC(nzlmax * lword); - usub = (void *) SUPERLU_MALLOC(nzumax * lword); - ++no_expand; - } - - if (nprocs == 1) - cntelt_vtcs = NULL; - else - cntelt_vtcs = intMalloc_symbfact (nvtcs_loc+1); - - /* allocate memory for communication data structures */ - CS->rcv_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1); - CS->snd_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1); - CS->ptr_rcvBuf = intMalloc_symbfact (2 * (int_t) nprocs ); - CS->rcv_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1); - CS->snd_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1); - - CS->snd_interSz = intMalloc_symbfact ((int_t) nlvls + 1); - CS->snd_LinterSz = intMalloc_symbfact ((int_t) nlvls + 1); - CS->snd_vtxinter = intMalloc_symbfact ((int_t) nlvls + 1); - CS->rcv_bufSz = 0; - CS->rcv_buf = NULL; - CS->snd_bufSz = 0; - CS->snd_buf = NULL; - - for (p = 0; p < nprocs; p++) { - CS->rcv_interLvl[p] = EMPTY; - CS->snd_interLvl[p] = EMPTY; - CS->rcv_intraLvl[p] = EMPTY; - CS->snd_intraLvl[p] = EMPTY; - } - - for (p = 0; p <= nlvls; p++) { - CS->snd_vtxinter[p] = EMPTY; - CS->snd_interSz[p] = 0; - CS->snd_LinterSz[p] = 0; - } - - Pslu_freeable->supno_loc = supno; - Llu_symbfact->lsub = lsub; - Llu_symbfact->xlsub = xlsub; - Llu_symbfact->usub = usub; - Llu_symbfact->xusub = xusub; - Llu_symbfact->szLsub = nzlmax; - Llu_symbfact->szUsub = nzumax; - Llu_symbfact->cntelt_vtcs = cntelt_vtcs; - - Llu_symbfact->no_expand = no_expand; - - return SUCCES_RET; -} /* SYMBFACT_ALLOC */ - -static int_t -symbfact_vtx -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t vtx, /* Input - vertex number to perform symbolic factorization */ - int_t vtx_lid, /* Input - local vertex number */ - int_t vtx_prid, /* Input - */ - int_t computeL, /* Input - TRUE when compute column L(:,vtx) - otheriwse compute row U(vtx, :) */ - int domain_symb, /* Input - if TRUE, computation corresponds to the independent - domain at the bottom of the separator tree */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - int_t snrep_lid, /* local index of current supernode reprezentative */ - int_t szSn, /* size of supernode with snrep_lid reprezentative */ - int_t *p_next, /* next element in sub structure */ - int_t *marker, - int_t *sub_rcvd, /* elements of node */ - int_t sub_rcvd_sz, /* size of sub to be explored */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - int_t *p_neltsVtxInit, - int_t *p_neltsVtx, - int_t *p_neltsVtx_CSep, - int_t *p_neltsZrVtx, - int_t *p_neltsMatched, - int_t mark_vtx, - int_t *p_prval_curvtx, - int_t vtx_bel_othSn, - int_t *p_vtx_bel_mySn - ) -{ - int_t x_aind_beg, x_aind_end, upd_lstSn; - int_t k, vtx_elt, ind, pr, pr_lid, mem_error, ii, jj, compRcvd; - int_t *xsub, *sub, *xsubPr, *subPr, *xsub_rcvd, *xsub_src, *sub_src; - int_t pr_elt, next, prval_curvtx, maxNvtcsPProc; - int_t neltsVtx, neltsMatched, neltsZrVtx, neltsZrSn, neltsVtx_CSep; - int_t neltsVtxInit, kk; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - upd_lstSn = FALSE; - prval_curvtx = *p_prval_curvtx; - neltsVtx_CSep = 0; - next = *p_next; - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsub_rcvd = Llu_symbfact->xlsub_rcvd; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsub_rcvd = Llu_symbfact->xusub_rcvd; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - } - - x_aind_beg = xsub[vtx_lid]; - x_aind_end = xsub[vtx_lid + 1]; - xsub[vtx_lid] = next; - k = x_aind_beg; - /* while (sub[k] != EMPTY && k < x_aind_end) { */ - while (k < x_aind_end) { - if (sub[k] == EMPTY) - k = x_aind_end; - else { - vtx_elt = sub[k]; - if (!computeL) - if (marker[vtx_elt] == mark_vtx - 2) - if (vtx_elt < prval_curvtx) - prval_curvtx = vtx_elt; - marker[vtx_elt] = mark_vtx; - if (!computeL && vtx_elt == vtx) - printf ("Pe[%d] ERROR diag elt in U part vtx %d dom_s %d fstV %d lstV %d\n", - iam, vtx, domain_symb, fstVtx, lstVtx); - else { - sub[next] = vtx_elt; - next ++; - } - if (vtx_elt < lstVtx) neltsVtx_CSep ++; - k++; - } - } - neltsVtxInit = k - x_aind_beg; - PS->nops += neltsVtxInit; - - if (domain_symb) { - if (computeL) - VInfo->nnz_ainf_loc -= x_aind_end - x_aind_beg; - else - VInfo->nnz_asup_loc -= x_aind_end - x_aind_beg; - } - -#ifdef TEST_SYMB - printf ("compL %d vtx %d vtx_lid %d vtx_prid %d vtx_bel_othSn %d\n", - computeL, vtx, vtx_lid, vtx_prid, vtx_bel_othSn); - PrintInt10 ("A(:, v)", x_aind_end - x_aind_beg, &(sub[xsub[vtx_lid]])); -#endif - - ind = xsubPr[vtx_prid]; - if (vtx_bel_othSn == vtx) - upd_lstSn = TRUE; - - while (ind != EMPTY || upd_lstSn) { - if (upd_lstSn ) { - upd_lstSn = FALSE; - pr_lid = snrep_lid; - } - else { - pr_lid = subPr[ind]; - ind = subPr[ind - 1]; - } - - if (!computeL) - marker[vtx] = mark_vtx; - if (pr_lid >= VInfo->nvtcs_loc) { - compRcvd = TRUE; - xsub_src = xsub_rcvd; sub_src = sub_rcvd; - pr_lid -= VInfo->nvtcs_loc; - k = xsub_src[pr_lid] + RCVD_IND; - } - else { - compRcvd = FALSE; - xsub_src = xsub; sub_src = sub; - k = xsub_src[pr_lid]; - } - - PS->nops += xsub_src[pr_lid+1] - xsub_src[pr_lid]; - for (; k < xsub_src[pr_lid+1]; k++) { - pr_elt = sub_src[k]; - if (pr_elt >= vtx && marker[pr_elt] != mark_vtx) { - - /* TEST available memory */ - if (next >= x_aind_end) { - if (domain_symb) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - } else if (mem_error = - psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, - computeL, LL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - - x_aind_end = xsub[vtx_lid + 1]; - if (computeL) sub = Llu_symbfact->lsub; - else sub = Llu_symbfact->usub; - if (!compRcvd) - sub_src = sub; - } - - sub[next] = pr_elt; next ++; - - if (pr_elt < lstVtx) neltsVtx_CSep ++; - if (!computeL) - if (marker[pr_elt] == mark_vtx - 2) - if (pr_elt < prval_curvtx) - prval_curvtx = pr_elt; - marker[pr_elt] = mark_vtx; - } - } - } - - neltsVtx = next - xsub[vtx_lid]; - neltsZrVtx = 0; /* number of zero elements which would - be introduced in the vertex */ - neltsZrSn = 0; /* -"- in the supernode */ - neltsMatched = 0; - if (vtx != fstVtx) { - for (k = xsub[snrep_lid]; k < xsub[snrep_lid+1]; k++) { - vtx_elt = sub[k]; - if (vtx_elt >= vtx) { - if ((vtx_elt > vtx && !computeL) || - (vtx_elt >= vtx && computeL)) { - if (marker[vtx_elt] != mark_vtx) - neltsZrVtx ++; - else { - neltsMatched ++; - } - } - if (computeL && vtx_elt == vtx) - *p_vtx_bel_mySn = vtx; - if (!computeL && vtx_elt == vtx + 1) - *p_vtx_bel_mySn = vtx + 1; - } - } - } - else { - neltsMatched = neltsVtx; - if (! computeL) - for (k = xsub[vtx_lid]; k < next; k++) { - vtx_elt = sub[k]; - if (vtx_elt == vtx + 1) - *p_vtx_bel_mySn = vtx + 1; - } - } - - *p_neltsVtxInit = neltsVtxInit; - *p_neltsVtx = neltsVtx; - *p_neltsVtx_CSep = neltsVtx_CSep; - *p_neltsZrVtx = neltsZrVtx; - *p_neltsMatched = neltsMatched; - *p_next = next; - *p_prval_curvtx = prval_curvtx; - return SUCCES_RET; -} - -static int_t -updateRcvd_prGraph -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t *sub_rcvd, /* elements of node */ - int_t sub_rcvd_sz, /* Input - size of sub to be used in the update */ - int_t fstVtx_toUpd, /* Input - first vertex to update */ - int_t lstVtx_toUpd, /* Input - last vertex to update */ - int_t pr_offset, - int computeL, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - psymbfact_stat_t *PS - /* marker: first elements of marker contain the nodes that will - be used in the updates */ -) -{ - int_t i, k, nelts, prVal, vtx_elt, vtx_elt_lid, ind; - int_t vtx, vtx_lid, fstVtx_toUpd_lid, fstVtx_srcUpd_lid; - int_t *xsub, *sub, *xsub_rcvd, *xsubPr, *subPr, szsubPr, *p_indsubPr; - int_t maxNvtcsPProc, *globToLoc, mem_error; - int_t nvtcs_toUpd, fstVtx_srcUpd, vtx_lid_p; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - nvtcs_toUpd = lstVtx_toUpd - fstVtx_toUpd; - - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsub_rcvd = Llu_symbfact->xlsub_rcvd; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - p_indsubPr = &(Llu_symbfact->indLsubPr); - szsubPr = Llu_symbfact->szLsubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsub_rcvd = Llu_symbfact->xusub_rcvd; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - p_indsubPr = &(Llu_symbfact->indUsubPr); - szsubPr = Llu_symbfact->szUsubPr; - } - - /* count number of elements in transpose representation of sub_rcvd */ - /* use marker to count those elements */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = 0; - for (i = 0; i <= VInfo->maxSzBlk; i++) - xsub_rcvd[i] = 0; - - i = 0; - fstVtx_srcUpd = EMPTY; - while (i < sub_rcvd_sz) { - vtx = sub_rcvd[i + DIAG_IND]; - nelts = sub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - prVal = sub_rcvd[i]; - if (fstVtx_srcUpd == EMPTY) fstVtx_srcUpd = vtx; - xsub_rcvd[vtx - fstVtx_srcUpd] = i - RCVD_IND; - xsub_rcvd[vtx-fstVtx_srcUpd+1] = i + nelts; - for (k = i; k < i + nelts; k++) { - vtx_elt = sub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - - fstVtx_toUpd_lid; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - - vtx_lid = fstVtx_toUpd_lid - pr_offset; - ind = 0; - for (i = 0; i < nvtcs_toUpd; i++) { - if (marker[i] != 0) { - xsubPr[vtx_lid] = ind + 1; - ind += 2* marker[i]; - marker[i] = xsubPr[vtx_lid] - 1; - } - vtx_lid ++; - } - - if (ind == 0) - /* quick return if no update */ - return; - - /* test if enough memory in usubPr array */ - if (ind >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) - return (mem_error); - if (computeL) - subPr = Llu_symbfact->lsubPr; - else - subPr = Llu_symbfact->usubPr; - } - *p_indsubPr = ind; - - i = 0; - while (i < sub_rcvd_sz) { - vtx = sub_rcvd[i + DIAG_IND]; - nelts = sub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - prVal = sub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = sub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - vtx_lid_p = vtx_elt_lid - pr_offset; - vtx_elt_lid -= fstVtx_toUpd_lid; - /* add vtx to structure of pruned graph */ - if (marker[vtx_elt_lid] != xsubPr[vtx_lid_p] - 1) - subPr[marker[vtx_elt_lid] - 2] = marker[vtx_elt_lid] + 1; - subPr[marker[vtx_elt_lid] + 1] = vtx - fstVtx_srcUpd + VInfo->nvtcs_loc; - subPr[marker[vtx_elt_lid]] = EMPTY; - marker[vtx_elt_lid] += 2; - } - } - } - } - i += nelts; - } - - for (i = fstVtx_toUpd; i < nvtcs_toUpd; i++) - marker[i] = 0; -} - -static int_t -update_prGraph -( - int iam, - int_t n, /* order of the matrix */ - int_t fstVtx_blk, /* first vertex in block to factorize */ - int_t lstVtx_blk, /* last vertex in block to factorize */ - int_t snrep_lid, /* local index of current supernode reprezentative */ - int_t pr_offset, /* offset in the indexing of prune structure */ - int_t prval_cursn, /* prune value of current supernode reprezentative */ - int_t xsub_snp1, /* denotes xsub[snrep_lid + 1] */ - int computeL, /* Input - if 1, compute column L(:,vtx) - else compute row U(vtx, :) */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - psymbfact_stat_t *PS - ) -{ - int_t k, mem_error; - int_t kmin, kmax, ktemp, maxElt; - int_t sn_elt, sn_elt_prid; - int_t *globToLoc, maxNvtcsPProc; - int_t *xsub, *sub, *xsubPr, *subPr; - int_t *p_indsubPr, szsubPr; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - - if (computeL) { - xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub; - xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr; - p_indsubPr = &(Llu_symbfact->indLsubPr); - szsubPr = Llu_symbfact->szLsubPr; - } - else { - xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub; - xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr; - p_indsubPr = &(Llu_symbfact->indUsubPr); - szsubPr = Llu_symbfact->szUsubPr; - } - - kmin = xsub[snrep_lid]; - kmax = xsub_snp1 - 1; - if (prval_cursn != n) - maxElt = prval_cursn; - else - maxElt = EMPTY; - while (kmin <= kmax) { - if (prval_cursn == n) { - /* compute maximum element of L(:, vtx) */ - if (sub[kmin] > maxElt) - maxElt = sub[kmin]; - kmin ++; - } - else { - /* Do a quicksort-type partition. */ - if (sub[kmax] > prval_cursn) - kmax--; - else if (sub[kmin] <= prval_cursn) - kmin++; - else { /* kmin does'nt belong to G^s(L), and kmax belongs: - * interchange the two subscripts - */ - ktemp = sub[kmin]; - sub[kmin] = sub[kmax]; - sub[kmax] = ktemp; - kmin ++; - kmax --; - } - } - } - k = xsub[snrep_lid]; - while (sub[k] <= prval_cursn && k < xsub_snp1) { - sn_elt = sub[k]; - if (sn_elt < lstVtx_blk) { - sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset; - if ((*p_indsubPr) + 2 >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) - return (mem_error); - if (computeL) { - subPr = Llu_symbfact->lsubPr; szsubPr = Llu_symbfact->szLsubPr; - } - else { - subPr = Llu_symbfact->usubPr; szsubPr = Llu_symbfact->szUsubPr; - } - } - /* add krow to structure of pruned graph */ - subPr[(*p_indsubPr) + 1] = snrep_lid; - subPr[(*p_indsubPr)] = xsubPr[sn_elt_prid]; - xsubPr[sn_elt_prid] = (*p_indsubPr) + 1; - (*p_indsubPr) += 2; - } - if (sn_elt == maxElt) { - /* move prune val in the first position */ - sub[k] = sub[xsub[snrep_lid]]; - sub[xsub[snrep_lid]] = sn_elt; - } - k ++; - } - return SUCCES_RET; -} - -static int_t -blk_symbfact -(SuperMatrix *A, - int iam, - int lvl, - int szSep, - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx_loc, /* Input - first vertex local of the level */ - int_t fstVtx_blk, - int_t lstVtx_blk, - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc - ) -{ - int szSep_tmp, lvl_tmp, ii, jj; - int_t *xlsubPr, *xusubPr; - int_t *xsup, *supno, *lsub, *xlsub, *usub, *xusub; - int_t vtx_lid, vtx_prid, vtx, vtx_super, vtx_elt, maxNvtcsPProc; - int_t ind, pr, pr_elt, newnext, k, vtx_elt_lid; - int_t nextl, nextu, nsuper_loc, nvtcs, n, mem_error; - int_t x_aind_beg, x_aind_end, i, szLp, xlsub_snp1, xusub_snp1; - int_t snrep, snrep_lid, szsn, vtxp1, *globToLoc, domain_symb; - int_t lstVtx, neltsCurSep, maxNeltsVtx, fstVtx_loc_lid; - /* supernode relaxation parameters */ - int_t neltsVtx_L, neltsZrVtx_L, neltsMatched_L, neltsVtx_CSep_L; - int_t neltsVtx_U, neltsZrVtx_U, neltsMatched_U, neltsVtx_CSep_U; - int_t neltsZrSn_L, neltsZrSn_U, neltsZr, neltsTotal, - neltsZr_tmp, neltsTotal_tmp, neltsZrSn, neltsVtxInit_l, neltsVtxInit_u; - /* next vertex belongs to current supernode pruned structure */ - int_t vtx_bel_snL, vtx_bel_snU; - /* marker variables */ - int_t markl1_vtx, markl2_vtx, marku1_vtx, marku2_vtx; - /* prune structure variables */ - int_t prval_cursn, prval_curvtx, pr_offset; - /* variables for comms info */ - int_t neltSn_L, neltSn_U, lstVtx_tmp, stat; - float relax_param, relax_seps; - - if (fstVtx_blk >= lstVtx_blk) - return; - - /* Initializations */ - supno = Pslu_freeable->supno_loc; - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - xusubPr = Llu_symbfact->xusubPr; - xlsubPr = Llu_symbfact->xlsubPr; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - maxNeltsVtx = VInfo->maxNeltsVtx; - - n = A->ncol; - nextl = *p_nextl; - nextu = *p_nextu; - neltsZr = *p_neltsZr; - neltsTotal = *p_neltsTotal; - nsuper_loc = *p_nsuper_loc; - marku2_vtx = *p_mark; - lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - - snrep = fstVtx_blk; - snrep_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - szsn = 1; - nvtcs = lstVtx_blk - fstVtx_blk; - prval_cursn = n; - vtx_bel_snL = EMPTY; vtx_bel_snU = EMPTY; - - /* set up to EMPTY xlsubPr[], xusubPr[] */ - if (PS->maxSzLPr < Llu_symbfact->indLsubPr) - PS->maxSzLPr = Llu_symbfact->indLsubPr; - if (PS->maxSzUPr < Llu_symbfact->indUsubPr) - PS->maxSzUPr = Llu_symbfact->indUsubPr; - for (i = 0; i < nvtcs; i++) { - xlsubPr[i] = EMPTY; - xusubPr[i] = EMPTY; - } - Llu_symbfact->indLsubPr = 0; - Llu_symbfact->indUsubPr = 0; - - if (ind_sizes1 == 0) - domain_symb = TRUE; - else { - domain_symb = FALSE; - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - } - - vtx_prid = 0; - vtx_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - pr_offset = vtx_lid; - - if (lsub_rcvd != NULL) { - updateRcvd_prGraph (n, iam, lsub_rcvd, lsub_rcvd_sz, - fstVtx_blk, lstVtx_blk, pr_offset, 1, marker, - Pslu_freeable, Llu_symbfact, VInfo, PS); - updateRcvd_prGraph (n, iam, usub_rcvd, usub_rcvd_sz, - fstVtx_blk, lstVtx_blk, pr_offset, 0, marker, - Pslu_freeable, Llu_symbfact, VInfo, PS); - } - - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++, vtx_prid ++) { - vtxp1 = vtx + 1; - if (marku2_vtx +4 >= n) { - /* reset to EMPTY marker array */ - for (i = 0; i < n; i++) - marker[i] = EMPTY; - marku2_vtx = EMPTY; - } - markl1_vtx = marku2_vtx + 1; markl2_vtx = markl1_vtx + 1; - marku1_vtx = markl2_vtx + 1; marku2_vtx = marku1_vtx + 1; - - prval_curvtx = n; - /* Compute nonzero structure L(:,vtx) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextl, - marker, - lsub_rcvd, lsub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, - &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, - &neltsMatched_L, markl1_vtx, &prval_curvtx, - vtx_bel_snU, &vtx_bel_snL)) - return (mem_error); - lsub = Llu_symbfact->lsub; - -#ifdef TEST_SYMB - PrintInt10 ("L(:, %d)", nextl - xlsub[vtx_lid], &(lsub[xlsub[vtx_lid]])); -#endif - - /* Compute nonzero structure of U(vtx,:) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextu, - marker, - usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, - &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, - &neltsMatched_U, marku1_vtx, &prval_curvtx, - vtx_bel_snL, &vtx_bel_snU)) - return (mem_error); - usub = Llu_symbfact->usub; - -#ifdef TEST_SYMB - PrintInt10 ("U(%d, :)", nextu - xusub[vtx_lid], &(usub[xusub[vtx_lid]])); -#endif - - /* update statistics on fill-in */ - if (!domain_symb) { - stat = CEILING( (neltsVtxInit_l + neltsVtxInit_u), 2); - if (Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] != stat) { - stat = CEILING(stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid]); - PS->fill_pelt[0] += (float) stat; - if ((float) stat > PS->fill_pelt[1]) PS->fill_pelt[1] = (float) stat; - PS->fill_pelt[2] += 1.; - } - stat = CEILING( (neltsVtx_L + neltsVtx_U), 2); - stat = CEILING( stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] ); - PS->fill_pelt[3] += (float) stat; - if ((float) stat > PS->fill_pelt[4]) PS->fill_pelt[4] = (float) stat; - PS->fill_pelt[5] += 1.; - } - - /* compute number of artificial zeros */ - neltsTotal = 0; - neltsZr = 0; - neltsZrSn_L = neltsVtx_L - neltsMatched_L; - neltsZrSn_U = neltsVtx_U - neltsMatched_U; - neltsZrSn = neltsZrVtx_L + neltsZrVtx_U + - (neltsZrSn_L + neltsZrSn_U) * szsn; - neltsZr_tmp = neltsZr + neltsZrSn; - neltsTotal_tmp = neltsTotal + neltsZrSn + neltsVtx_L + neltsVtx_U; - if (neltsTotal_tmp == 0) - neltsTotal_tmp = 1; - relax_param = (float) (neltsTotal_tmp - neltsZr_tmp) / neltsTotal_tmp; - -#ifdef TEST_SYMB - printf ("[%d] vtx %d pr %d szsn %d nVtx_L %d nZrSn_L %d nZrVtx_L %d\n", - iam, vtx, prval_curvtx, szsn,neltsVtx_L, neltsZrSn_L, neltsZrVtx_L); - printf (" [%d] nVtx_U %d, nZrSn_U %d nZrVtx_U %d nextl %d nextu %d\n", - iam, neltsVtx_U, neltsZrSn_U, neltsZrVtx_U, nextl, nextu); - printf (" [%d] nZr %d nZr_tmp %d nTot %d nTot_tmp %d rel %f test %d\n\n", - iam, neltsZr, neltsZr_tmp, neltsTotal, neltsTotal_tmp, - relax_param, i); -#endif - - /* Check to see if vtx belongs in the same supernode as vtx-1 */ - supno[vtx_lid] = nsuper_loc; - if (vtx == fstVtx_blk) { - prval_cursn = prval_curvtx; - neltsTotal += neltsVtx_L + neltsVtx_U; - } - else { - if (maxNeltsVtx > 0) { - relax_seps = (float) neltsVtx_L / (float) maxNeltsVtx; - relax_seps *= (float) (neltsVtx_U+1) / (float) maxNeltsVtx; - } - else - relax_seps = 0.0; - - /* check if all upper separators are dense */ - if (relax_seps >= PS->relax_seps ) { - VInfo->filledSep = FILLED_SEPS; - *p_nextl = xlsub[vtx_lid]; - *p_nextu = xusub[vtx_lid]; - nsuper_loc += 1; - *p_nsuper_loc = nsuper_loc; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, vtx, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - /* set up neltsZr and neltsTotal */ - vtx = lstVtx_blk; - return 0; - } /* if all upper separators are dense */ - else { - if (relax_param >= PS->relax_gen) { - /* vertex belongs to the same supernode */ - if (prval_cursn > prval_curvtx || prval_cursn <= vtx) - prval_cursn = prval_curvtx; - neltsZr = neltsZr_tmp; - neltsTotal = neltsTotal_tmp; - szsn ++; - /* add artificial zeros at the structure of current supernode */ - newnext = xlsub[snrep_lid+1]; - if (neltsZrSn_L != 0) { - for (k = xlsub[snrep_lid]; k < xlsub[snrep_lid+1]; k++) { - vtx_elt = lsub[k]; - if (vtx_elt >= vtx) - marker[vtx_elt] = markl2_vtx; - } - for (k = xlsub[vtx_lid]; k < nextl; k++) { - vtx_elt = lsub[k]; - if (marker[vtx_elt] != markl2_vtx) { - /* add vtx_elt to the structure of snrep */ - lsub[newnext] = vtx_elt; newnext ++; - marker[vtx_elt] = markl2_vtx; - } - } - xlsub[snrep_lid+1] = newnext; - } - xlsub[vtx_lid] = newnext; - nextl = newnext; - neltsVtx_L += neltsZrVtx_L; - - newnext = xusub[snrep_lid+1]; - if (neltsZrSn_U != 0) { - for (k = xusub[snrep_lid]; k < xusub[snrep_lid+1]; k++) { - vtx_elt = usub[k]; - if (vtx_elt >= vtx) { - if (marker[vtx_elt] == markl2_vtx) - if (prval_cursn > vtx_elt && vtx_elt != vtx) - prval_cursn = vtx_elt; - marker[vtx_elt] = marku2_vtx; - } - } - for (k = xusub[vtx_lid]; k < nextu; k++) { - vtx_elt = usub[k]; - if (marker[vtx_elt] != marku2_vtx) { - /* add vtx_elt to the structure of snrep */ - usub[newnext] = vtx_elt; newnext ++; - if (marker[vtx_elt] == markl2_vtx) - if (prval_cursn > vtx_elt && vtx_elt != vtx) - prval_cursn = vtx_elt; - marker[vtx_elt] = marku2_vtx; - } - } - if (marker[vtxp1] == marku2_vtx) - vtx_bel_snU = vtxp1; - xusub[snrep_lid+1] = newnext; - } - xusub[vtx_lid] = newnext; - nextu = newnext; - neltsVtx_U += neltsZrVtx_U; - } /* if ( relax_param >= PS->relax_param) */ - } /* if (VInfo->filledSep != FILLED_SEPS) */ - } /* if (vtx != fstVtx_blk) */ - - if ((relax_param < PS->relax_gen || vtx == lstVtx_blk-1) - && VInfo->filledSep != FILLED_SEPS) { - /* if a new supernode starts or is the last vertex */ - /* vtx starts a new supernode. Note we only store the - * subscript set of the first column of a supernode. */ - - if (marker[vtxp1] == marku1_vtx) - vtx_bel_snU = vtxp1; - /* build the pruned structure */ - if (relax_param < PS->relax_gen - && vtx == lstVtx_blk - 1 && vtx != fstVtx_blk) - szLp = 2; - else - szLp = 1; - if (vtx == fstVtx_blk) { - xlsub_snp1 = nextl; - xusub_snp1 = nextu; - } - else { - xlsub_snp1 = xlsub[snrep_lid+1]; - xusub_snp1 = xusub[snrep_lid+1]; - } - while (szLp > 0) { - szLp --; -#ifdef TEST_SYMB - printf ("End sn %d szsn %d\n", nsuper_loc, szsn); - printf ("BLD pr vtx %d snrep %d prval %d szLp %d\n", - vtx, snrep, prval_cursn, szLp); -#endif - - update_prGraph (iam, n, fstVtx_blk, lstVtx_blk, - snrep_lid, pr_offset, prval_cursn, - xlsub_snp1, 1, - Pslu_freeable, Llu_symbfact, PS); - update_prGraph (iam, n, fstVtx_blk, lstVtx_blk, - snrep_lid, pr_offset, prval_cursn, - xusub_snp1, 0, - Pslu_freeable, Llu_symbfact, PS); - -#ifdef TEST_SYMB - printf ("Adr lsub %p usub %p lsub %p pos %d usub %p pos %d\n", - &(lsub[xlsub[snrep_lid]]), &(usub[xusub[snrep_lid]]), - lsub, xlsub[snrep_lid], usub, xusub[snrep_lid]); - PrintInt10 ("Lsn", xlsub_snp1 - xlsub[snrep_lid], - &(lsub[xlsub[snrep_lid]])); - PrintInt10 ("Usn", xusub_snp1 - xusub[snrep_lid], - &(usub[xusub[snrep_lid]])); -#endif - - if (prval_cursn >= lstVtx_blk) { - neltSn_L = xlsub_snp1 - xlsub[snrep_lid]; - neltSn_U = xusub_snp1 - xusub[snrep_lid]; - if (ind_sizes1 != 0) { - CS->snd_intraSz += neltSn_L + neltSn_U + 4; - CS->snd_LintraSz += neltSn_L + 2; - } - if (prval_cursn >= lstVtx) { - /* this supernode will be send to next layers of the tree */ - lvl_tmp = lvl; - ii = ind_sizes1; - jj = ind_sizes2; - szSep_tmp = szSep; - lstVtx_tmp = lstVtx; - while (prval_cursn >= lstVtx_tmp && szSep_tmp != 1) { - jj = ii + szSep_tmp + (jj - ii) / 2; - ii += szSep_tmp; - lvl_tmp ++; - szSep_tmp = szSep_tmp / 2; - lstVtx_tmp = fstVtxSep[jj] + sizes[jj]; - CS->snd_interSz[lvl_tmp] += neltSn_L + neltSn_U + 4; - CS->snd_LinterSz[lvl_tmp] += neltSn_L + 2; - if (CS->snd_vtxinter[lvl_tmp] == EMPTY) - CS->snd_vtxinter[lvl_tmp] = snrep; - } - } - } - snrep = vtx; - snrep_lid = vtx_lid; - prval_cursn = prval_curvtx; - szsn = 1; - xlsub_snp1 = nextl; - xusub_snp1 = nextu; - } - if (relax_param < PS->relax_gen) { - neltsTotal += neltsVtx_L + neltsVtx_U; - nsuper_loc ++; - supno[vtx_lid] = nsuper_loc; - if (marker[vtxp1] == marku1_vtx) - vtx_bel_snU = vtxp1; - else - vtx_bel_snU = EMPTY; - } - } - if (vtx == lstVtx_blk - 1) - nsuper_loc ++; - - /* check if current separator is dense */ - if (!VInfo->filledSep) { - relax_seps = (float) neltsVtx_CSep_L / (float) (lstVtx - vtx); - relax_seps *= (float) (neltsVtx_CSep_U+1) / (float) (lstVtx - vtx); - if (relax_seps >= PS->relax_curSep ) - VInfo->filledSep = FILLED_SEP; - } - maxNeltsVtx --; - } - - *p_mark = marku2_vtx + 1; - *p_nextl = nextl; - *p_nextu = nextu; - *p_neltsZr = neltsZr; - *p_neltsTotal = neltsTotal; - *p_nsuper_loc = nsuper_loc; - - return 0; -} - -static void -domain_symbfact -(SuperMatrix *A, - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator (node) */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc - ) -{ - int_t lstVtx_lid, maxNvtcsPProc; - - /* call blk_symbfact */ - blk_symbfact (A, iam, lvl, - szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - EMPTY, fstVtx, lstVtx, - NULL, EMPTY, NULL, EMPTY, - Pslu_freeable, Llu_symbfact, VInfo, CS, PS, - marker, p_mark, - p_nextl, p_nextu, p_neltsZr, p_neltsTotal, - p_nsuper_loc); - - if (VInfo->filledSep != FILLED_SEPS) { - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - if (fstVtx >= lstVtx) - lstVtx_lid = 0; - else - lstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[lstVtx-1] ) + 1; - VInfo->xlsub_nextLvl = Llu_symbfact->xlsub[lstVtx_lid]; - Llu_symbfact->xlsub[lstVtx_lid] = *p_nextl; - VInfo->xusub_nextLvl = Llu_symbfact->xusub[lstVtx_lid]; - Llu_symbfact->xusub[lstVtx_lid] = *p_nextu; - } - VInfo->maxNeltsVtx -= lstVtx - fstVtx; -} - -/* - * Compute counts of rows/columns of current separator. - * cntelt_vtcs[i] is 0 when i is nonzero before current separator - * and n when i is zero before current separator. - * - * Set up nvtcsLvl_loc. - */ -static void -initLvl_symbfact -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - MPI_Comm ndComm, - int_t *marker, - int_t nextl, - int_t nextu - ) -{ - int_t *cntelt_vtcs, x_aind_beg, x_aind_end, x_aind_beg_l, x_aind_beg_u, - nelts_asup, nelts_ainf; - int_t nvtcsLvl_loc, fstVtx_loc, fstVtx_loc_lid, fstVtx_nextLvl; - int_t curblk_loc, nblks_loc, ind_blk; - int_t *lsub, *xlsub, *usub, *xusub; - int_t *begEndBlks_loc, code_err, mem_error; - int_t i, j, k, vtx, vtx_lid, fstVtx_blk, lstVtx_blk, vtx_elt, p, fill; - int_t nelts, nelts_fill_l, nelts_fill_u, nelts_cnts, maxNvtcsPProc, *globToLoc; - int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u; - MPI_Status status; - - fill = PS->fill_par; - VInfo->filledSep = FALSE; - - /* Initializations */ - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - curblk_loc = VInfo->curblk_loc; - nblks_loc = VInfo->nblks_loc; - begEndBlks_loc = VInfo->begEndBlks_loc; - cntelt_vtcs = Llu_symbfact->cntelt_vtcs; - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* compute nvtcsLvl_loc */ - nvtcsLvl_loc = 0; - ind_blk = curblk_loc; - while (fstVtx > begEndBlks_loc[ind_blk] && ind_blk < 2 * nblks_loc) { - ind_blk += 2; - } - curblk_loc = ind_blk; - fstVtx_loc = begEndBlks_loc[ind_blk]; - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - nvtcsLvl_loc += begEndBlks_loc[ind_blk + 1] - - begEndBlks_loc[ind_blk]; - ind_blk += 2; - } - fstVtx_nextLvl = begEndBlks_loc[ind_blk]; - VInfo->nvtcsLvl_loc = nvtcsLvl_loc; - VInfo->curblk_loc = curblk_loc; - - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - vtx_lid = fstVtx_loc_lid; - x_aind_beg_l = VInfo->xlsub_nextLvl; - x_aind_beg_u = VInfo->xusub_nextLvl; - nelts_cnts = 0; - nelts_fill_l = 0; - nelts_fill_u = 0; - ind_blk = curblk_loc; - - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - fstVtx_blk = begEndBlks_loc[ind_blk]; - lstVtx_blk = begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) - nelts_cnts += cntelt_vtcs[vtx_lid]; - nelts_fill_l += fill * (xlsub[vtx_lid] - x_aind_beg_l); - nelts_fill_u += fill * (xusub[vtx_lid] - x_aind_beg_u); - x_aind_beg_l = xlsub[vtx_lid]; - x_aind_beg_u = xusub[vtx_lid]; - } - - if (nvtcsLvl_loc != 0) { - nelts_ainf = xlsub[vtx_lid] - VInfo->xlsub_nextLvl; - nelts_asup = xusub[vtx_lid] - VInfo->xusub_nextLvl; - } - else { - nelts_ainf = 0; - nelts_asup = 0; - } - - use_fillcnts = FALSE; - if (nextl + nelts_cnts >= Llu_symbfact->szLsub - nelts_ainf || - nextu + nelts_cnts >= Llu_symbfact->szUsub - nelts_asup) { - use_fillcnts = TRUE; - } - - use_fillcnts = TRUE; - - if (use_fillcnts) { - if (nextl + nelts_fill_l >= Llu_symbfact->szLsub - nelts_ainf) - mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx, nextl, - nextl + nelts_fill_l, LSUB, - RL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - if (nextu + nelts_fill_u >= Llu_symbfact->szUsub - nelts_asup) - mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx, nextu, - nextu + nelts_fill_u, USUB, - RL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - /* init xlsub[fstVtx:lstVtx] and xusub[fstVtx:lstVtx] and - copy elements of A[fstVtx:lstVtx, fstVtx:lstVtx] in lsub and usub */ - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - x_aind_beg_l = VInfo->xlsub_nextLvl; - x_aind_beg_u = VInfo->xusub_nextLvl; - vtx_lid = fstVtx_loc_lid; - ind_blk = curblk_loc; - - while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) { - fstVtx_blk = begEndBlks_loc[ind_blk]; - lstVtx_blk = begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) { - if (vtx_lid != fstVtx_loc_lid) { - x_aind_beg_l = xlsub[vtx_lid]; - x_aind_beg_u = xusub[vtx_lid]; - } - if (use_fillcnts) { - cntelt_vtx_l = fill * (xlsub[vtx_lid+1] - x_aind_beg_l); - cntelt_vtx_u = fill * (xusub[vtx_lid+1] - x_aind_beg_u); - } - else { - cntelt_vtx_l = cntelt_vtcs[vtx_lid]; - cntelt_vtx_u = cntelt_vtcs[vtx_lid]; - } - x_aind_end = xlsub[vtx_lid + 1]; - Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] = - CEILING( (xlsub[vtx_lid+1]-x_aind_beg_l + xusub[vtx_lid+1]-x_aind_beg_u), 2); - - xlsub[vtx_lid] = nextl; - nelts = 0; - for (k = x_aind_beg_l; k < x_aind_end; k++) { - lsub[nextl] = lsub[k]; nextl ++; - nelts ++; - } - if (nelts < cntelt_vtx_l) - lsub[nextl] = EMPTY; - nextl += cntelt_vtx_l - nelts; - x_aind_end = xusub[vtx_lid + 1]; - xusub[vtx_lid] = nextu; - nelts = 0; - for (k = x_aind_beg_u; k < x_aind_end; k++) { - usub[nextu] = usub[k]; nextu ++; - nelts ++; - } - if (nelts < cntelt_vtx_u) - usub[nextu] = EMPTY; - nextu += cntelt_vtx_u - nelts; - } - } - - if (nvtcsLvl_loc == 0) { - if (curblk_loc == 0) - vtx_lid = 0; - else { - if (begEndBlks_loc[curblk_loc-1] == 0) - vtx_lid = 0; - else - vtx_lid = LOCAL_IND( globToLoc[begEndBlks_loc[curblk_loc-1] - 1] ) + 1; - } - - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - } - else { - VInfo->xlsub_nextLvl = xlsub[vtx_lid]; - xlsub[vtx_lid] = nextl; - VInfo->xusub_nextLvl = xusub[vtx_lid]; - xusub[vtx_lid] = nextu; - if (PS->estimLSz < nextl) - PS->estimLSz = nextl; - if (PS->estimUSz < nextu) - PS->estimUSz = nextu; - - VInfo->nnz_ainf_loc -= nelts_ainf; - VInfo->nnz_asup_loc -= nelts_asup; - } - VInfo->fstVtx_nextLvl = fstVtx_nextLvl; -} - - -static int_t -expand_RL -( - int_t computeRcvd, /* if = 1, then update from receive buffer, - else update from own data */ - int_t n, - int iam, /* process number */ - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - int_t vtxXp, - int_t vtx_upd_pr, /* ind in pruned structure of upd vertex which - doesn't fit into the alloc memory */ - int_t lstVtx_upd_pr, /* ind in pruned structure of lst vtx to update */ - int_t fstVtx_srcUpd, /* first vertex source of the updates */ - int_t lstVtx_srcUpd, /* last vertex source of the updates */ - int_t fstVtx_toUpd, /* first vertex to update */ - int_t lstVtx_toUpd, /* last vertex to update */ - int_t nvtcs_toUpd, /* no of vertices to update */ - int computeL, - int_t *pmarkl, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS - ) -{ - int_t fstVtx_toUpd_lid, vtx_lid, vtx, vtx_elt, vtx_elt_lid, nextl, nelts_in; - int_t i, ii, j, nelts, nelts_vtx, mpnelts, lvtx_lid, elt, vtxXp_lid; - int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub; - int_t markl, *globToLoc, maxNvtcsPProc; - int_t mem_error, len_texp; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - - xusubPr = Llu_symbfact->xlsubPr; usubPr = Llu_symbfact->lsubPr; - if (computeL) { - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - } - else { - xlsub = Llu_symbfact->xusub; lsub = Llu_symbfact->usub; - xusub = Llu_symbfact->xlsub; usub = Llu_symbfact->lsub; - } - markl = *pmarkl + 1; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] ); - nextl = xlsub[vtxXp_lid+1]; - - lvtx_lid = EMPTY; - if (lstVtx_srcUpd != EMPTY) - lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] ); - - /* count the number of new elements, and update Llu_symbfact->cntelt_vtcs */ - vtx_lid = fstVtx_toUpd_lid; - vtx_lid += vtx_upd_pr; - len_texp = 0; - for (i = vtx_upd_pr; i < lstVtx_upd_pr; i++, vtx_lid ++) { - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - if (xusubPr[i] != xusubPr[i+1]) { - j = xusubPr[i]; - vtx = usubPr[j]; - /* setup marker structure for already existing elements */ - ii = xlsub[vtx_lid]; - while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) { - marker[lsub[ii]] = markl; - ii ++; - } - nelts_vtx = ii - xlsub[vtx_lid]; - for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) { - vtx_elt = usubPr[j]; - ii = marker[vtx_elt]; - if (computeRcvd) { - nelts = lsub_rcvd[ii + NELTS_IND]; - ii += RCVD_IND; - mpnelts = marker[vtx_elt] + nelts + RCVD_IND; - } - else { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - if (vtx_elt_lid == lvtx_lid) - nelts = lsub_rcvd_sz - ii; - else - nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid]; - mpnelts = marker[vtx_elt] + nelts; - } - - if (!computeL) - marker[vtx] = markl; - for (ii; ii < mpnelts; ii++) { - elt = lsub_rcvd[ii]; - if (elt >= vtx) { - if (marker[elt] != markl) { - /* add elt to structure of vtx */ - marker[elt] = markl; - nelts_vtx ++; - } - } - } - } - if (nelts_vtx != 0 && (nelts_vtx > xlsub[vtx_lid+1] - xlsub[vtx_lid])) { - nelts_in = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - if (nelts_in == 0) nelts_in = 1; - j = nelts_vtx / nelts_in; - if (nelts_vtx % nelts_in != 0) j++; - nelts_vtx = j * nelts_in; - } - else - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - markl ++; - if (markl == n) { - /* reset marker array */ - for (j = fstVtx_toUpd; j < n; j++) - marker[j] = EMPTY; - markl = 0; - } - } - Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx; - len_texp += nelts_vtx; - } - for (; i < nvtcs_toUpd; i++, vtx_lid++) { - nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx; - len_texp += nelts_vtx; - } - - *pmarkl = markl; - /* mark elements array */ - for (i = xlsub[vtxXp_lid]; i < nextl; i++) { - marker[lsub[i]] = markl; - } - - nextl = xlsub[vtxXp_lid+1]; - if (mem_error = - psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, - computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - - return 0; -} - - -static int_t -rl_update -( - int computeRcvd, /* if = 1, then update from receive buffer, - else update from own data */ - int_t n, - int iam, /* process number */ - int_t *lsub_rcvd, /* elements of node */ - int_t lsub_rcvd_sz, /* size of sub to be explored */ - int_t *usub_rcvd, - int_t usub_rcvd_sz, - int_t fstVtx_srcUpd, /* first vertex source of the updates */ - int_t lstVtx_srcUpd, /* last vertex source of the updates */ - int_t indBlk_srcUpd, /* block index of first vertex */ - int_t fstVtx_toUpd, /* first vertex to update */ - int_t lstVtx_toUpd, /* last vertex to update */ - int_t nvtcs_toUpd, /* no of vertices to update */ - int computeL, - int_t *pmarkl, - int_t *marker, - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS - /* marker: first elements of marker contain the nodes that will - be used in the updates */ - ) -{ - int_t i, j, k, prVal, nelts, ind, nextl, ii, mpnelts, mem_error; - int_t vtx, vtx_lid, vtx_elt, vtx_elt_lid, lvtx_lid; - int_t fstVtx_toUpd_lid, markl, elt, vtx_loc, ind_blk; - int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub; - int_t fstVtx_upd, lstVtx_upd, maxNvtcsPProc, *globToLoc; - int_t fstVtx_srcUpd_lid, nelts_vtx, expand; - - /* quick return */ - if (fstVtx_toUpd >= lstVtx_toUpd) - return; - - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - - fstVtx_upd = EMPTY; - lstVtx_upd = EMPTY; - xusubPr = Llu_symbfact->xlsubPr; usubPr = Llu_symbfact->lsubPr; - if (computeL) { - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - } - else { - xlsub = Llu_symbfact->xusub; lsub = Llu_symbfact->usub; - xusub = Llu_symbfact->xlsub; usub = Llu_symbfact->lsub; - } - markl = *pmarkl; - fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] ); - - /* count number of elements in transpose representation of usub_rcvd */ - /* use marker to count those elements */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = 0; - - i = 0; - if (fstVtx_srcUpd != EMPTY) { - fstVtx_srcUpd_lid = LOCAL_IND( globToLoc[fstVtx_srcUpd] ); - vtx_lid = fstVtx_srcUpd_lid; - } - lvtx_lid = EMPTY; - if (lstVtx_srcUpd != EMPTY) - lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] ); - - while (i < usub_rcvd_sz) { - if (computeRcvd) { - vtx = usub_rcvd[i + DIAG_IND]; - nelts = usub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - } - else { - if (vtx_lid == lvtx_lid) - nelts = usub_rcvd_sz - i; - else - nelts = xusub[vtx_lid + 1] - xusub[vtx_lid]; - vtx_lid ++; - } - prVal = usub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = usub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt] ) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - - fstVtx_toUpd_lid; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - - ind = 0; - for (i = 0; i < nvtcs_toUpd; i++) { - if (marker[i] != 0) { - marker[i] ++; - if (fstVtx_upd == EMPTY) - fstVtx_upd = i; - lstVtx_upd = i; - } - xusubPr[i] = ind; - ind += marker[i]; - marker[i] = xusubPr[i]; - } - xusubPr[i] = ind; - lstVtx_upd ++; - - if (ind == 0) - /* quick return if no update */ - return; - - /* test if enough memory in usubPr array */ - if (ind > Llu_symbfact->szLsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) - return (mem_error); - usubPr = Llu_symbfact->lsubPr; - } - - i = 0; - if (fstVtx_srcUpd != EMPTY) { - vtx_loc = fstVtx_srcUpd; - vtx_lid = LOCAL_IND( globToLoc[vtx_loc] ); - ind_blk = indBlk_srcUpd; - } - while (i < usub_rcvd_sz) { - if (computeRcvd) { - vtx = usub_rcvd[i + DIAG_IND]; - nelts = usub_rcvd[i + NELTS_IND]; - i += RCVD_IND; - } - else { - vtx = vtx_loc; - if (vtx_lid == lvtx_lid) - nelts = usub_rcvd_sz - i; - else - nelts = xusub[vtx_lid + 1] - xusub[vtx_lid]; - vtx_lid ++; - vtx_loc ++; - if (ind_blk != EMPTY) - if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) { - ind_blk += 2; - vtx_loc = VInfo->begEndBlks_loc[ind_blk]; - } - } - - prVal = usub_rcvd[i]; - for (k = i; k < i + nelts; k++) { - vtx_elt = usub_rcvd[k]; - if (vtx_elt > prVal) - k = i + nelts; - else { - if (OWNER( globToLoc[vtx_elt]) == iam) { - if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_toUpd_lid; - /* add vtx_elt to the pruned structure */ - if (marker[vtx_elt_lid] == xusubPr[vtx_elt_lid]) { - usubPr[marker[vtx_elt_lid]] = vtx_elt; - marker[vtx_elt_lid] ++; - } - usubPr[marker[vtx_elt_lid]] = vtx; - marker[vtx_elt_lid] ++; - } - } - } - } - i += nelts; - } - /* reset marker array */ - for (i = 0; i < nvtcs_toUpd; i++) - marker[i] = EMPTY; - if (fstVtx_srcUpd != EMPTY) { - vtx_loc = fstVtx_srcUpd; - vtx_lid = LOCAL_IND( globToLoc[vtx_loc] ); - ind_blk = indBlk_srcUpd; - } - i = 0; - while (i < lsub_rcvd_sz) { - if (computeRcvd) { - vtx = lsub_rcvd[i + DIAG_IND]; - nelts = lsub_rcvd[i + NELTS_IND]; - marker[vtx] = i; - i += RCVD_IND; - } - else { - vtx = vtx_loc; - if (vtx_lid == lvtx_lid) - nelts = lsub_rcvd_sz - i; - else - nelts = xlsub[vtx_lid + 1] - xlsub[vtx_lid]; - vtx_lid ++; - marker[vtx] = i; - vtx_loc ++; - if (ind_blk != EMPTY) - if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) { - ind_blk += 2; - vtx_loc = VInfo->begEndBlks_loc[ind_blk]; - } - } - i += nelts; - } - - /* use the pruned structure to update symbolic factorization */ - vtx_lid = fstVtx_toUpd_lid; - vtx_lid += fstVtx_upd; - for (i = fstVtx_upd; i < lstVtx_upd; i++, vtx_lid ++) { - if (xusubPr[i] != xusubPr[i+1]) { - j = xusubPr[i]; - vtx = usubPr[j]; - /* setup marker structure for already existing elements */ - ii = xlsub[vtx_lid]; - while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) { - marker[lsub[ii]] = markl; - ii ++; - } - PS->nops += ii - xlsub[vtx_lid]; - nextl = ii; - for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) { - vtx_elt = usubPr[j]; - ii = marker[vtx_elt]; - if (computeRcvd) { - nelts = lsub_rcvd[ii + NELTS_IND]; - ii += RCVD_IND; - mpnelts = marker[vtx_elt] + nelts + RCVD_IND; - } - else { - vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ); - if (vtx_elt_lid == lvtx_lid) - nelts = lsub_rcvd_sz - ii; - else - nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid]; - mpnelts = marker[vtx_elt] + nelts; - } - - if (!computeL) - marker[vtx] = markl; - PS->nops += mpnelts - ii; - for (ii; ii < mpnelts; ii++) { - elt = lsub_rcvd[ii]; - if (elt >= vtx) { - if (marker[elt] != markl) { - /* add elt to structure of vtx */ - if (nextl >= xlsub[vtx_lid + 1]) { - if (mem_error = - expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, vtx, i, - lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, - fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, - &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - if (computeL) { - lsub = Llu_symbfact->lsub; - if (!computeRcvd) - lsub_rcvd = - &(Llu_symbfact->lsub[Llu_symbfact->xlsub[fstVtx_srcUpd_lid]]); - } else { - marker[vtx] = markl; - lsub = Llu_symbfact->usub; - if (!computeRcvd) - lsub_rcvd = - &(Llu_symbfact->usub[Llu_symbfact->xusub[fstVtx_srcUpd_lid]]); - } - } - lsub[nextl] = elt; nextl ++; - marker[elt] = markl; - } - } - } - } - if (nextl < xlsub[vtx_lid+1]) - lsub[nextl] = EMPTY; - markl ++; - if (markl == n) { - /* reset marker array */ - for (j = fstVtx_toUpd; j < n; j++) - marker[j] = EMPTY; - markl = 0; - } - } - } - *pmarkl = markl; - - return 0; -} - -static int_t -dnsUpSeps_symbfact -( - int_t n, - int iam, /* my processor number */ - int szSep, - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t fstVtx_dns, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_nsuper_loc - ) -{ - int_t nextl, nextu, nsuper_loc, curblk_loc, mem_error; - int_t vtx_elt, ind_blk, vtx, k; - int_t *xlsub, *xusub, *lsub, *usub; - int_t fstVtx_blk, fstVtx_blk_lid, vtx_lid, lstVtx_blk, fstVtx_lvl, lstVtx_lvl; - int_t *globToLoc, maxNvtcsPProc; - - /* Initialization */ - xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub; - xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - nextl = *p_nextl; - nextu = *p_nextu; - nsuper_loc = *p_nsuper_loc; - curblk_loc = VInfo->curblk_loc; - VInfo->nnz_ainf_loc = 0; - VInfo->nnz_asup_loc = 0; - - if (fstVtx_dns == EMPTY) - fstVtx_blk = VInfo->begEndBlks_loc[curblk_loc]; - else - fstVtx_blk = fstVtx_dns; - if (fstVtx_blk == n) - return 0; - fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - vtx_lid = fstVtx_blk_lid; - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - PS->nDnsUpSeps = 0; - - while (szSep >= 1) { - PS->nDnsUpSeps++; - fstVtx_lvl = fstVtxSep[ind_sizes2]; - lstVtx_lvl = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - if (fstVtx_blk > fstVtx_lvl) - vtx_elt = fstVtx_blk; - else - vtx_elt = fstVtx_lvl; - if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, - nextl + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - lsub = Llu_symbfact->lsub; - } - if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, - nextu + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - usub = Llu_symbfact->usub; - } - PS->nops += 2 * (lstVtx_lvl - vtx_elt); - for (; vtx_elt < lstVtx_lvl; vtx_elt++) { - lsub[nextl] = vtx_elt; nextl++; - usub[nextu] = vtx_elt; nextu++; - } - ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2; - ind_sizes1 += szSep; - szSep = szSep / 2; - } - /* delete the diagonal element from the U structure */ - usub[xusub[fstVtx_blk_lid]] = usub[nextu - 1]; - nextu --; - xlsub[fstVtx_blk_lid+1] = nextl; - xusub[fstVtx_blk_lid+1] = nextu; - - vtx_lid = fstVtx_blk_lid; - ind_blk = curblk_loc; - while (ind_blk < 2 * VInfo->nblks_loc) { - if (ind_blk != curblk_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - - for (k = xlsub[fstVtx_blk_lid]; k < xlsub[fstVtx_blk_lid+1]; k++) - if (lsub[k] >= fstVtx_blk) { - lsub[nextl] = lsub[k]; nextl ++; - if (nextl >= MEM_LSUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - lsub = Llu_symbfact->lsub; - } - for (k = xusub[fstVtx_blk_lid]; k < xusub[fstVtx_blk_lid+1]; k++) - if (usub[k] > fstVtx_blk) { - usub[nextu] = usub[k]; nextu ++; - if (nextu >= MEM_USUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, - USUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - usub = Llu_symbfact->usub; - } - PS->nops += xlsub[fstVtx_blk_lid+1] - xlsub[fstVtx_blk_lid]; - PS->nops += xusub[fstVtx_blk_lid+1] - xusub[fstVtx_blk_lid]; - } - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - Pslu_freeable->supno_loc[vtx_lid] = nsuper_loc; - if (vtx > fstVtx_blk) { - xlsub[vtx_lid] = nextl; - xusub[vtx_lid] = nextu; - } - } - ind_blk += 2; - nsuper_loc ++; - } - - *p_nextl = nextl; - *p_nextu = nextu; - *p_nsuper_loc = nsuper_loc; -/* VInfo->curblk_loc = ind_blk; */ - - return 0; -} - -static int_t -dnsCurSep_symbfact -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int szSep, - int npNode, - int_t rcvd_dnsSep, - int_t *p_nextl, - int_t *p_nextu, - int_t *p_mark, - int_t *p_nsuper_loc, - int_t *marker, /* temporary array of size n */ - MPI_Comm ndCom, - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS - ) -{ - int_t fstVtx_blk, fstVtx_dns, fstVtx_dns_lid, lstVtx_blk, - fstVtx, lstVtx, lstVtx_dns_lid; - int_t ind_blk, i, vtx, vtx_lid, vtx_lid_x, nvtcs_upd, save_cnt, mem_error; - int_t computeL, computeU, vtx_elt, j, cur_blk, snlid, snrep; - int_t *sub, *xsub, *minElt_vtx, *cntelt_vtcs; - int_t mark, next, *x_newelts, *x_newelts_L, *x_newelts_U; - int_t *newelts_L, *newelts_U, *newelts; - int_t *globToLoc, maxNvtcsPProc, lvl; - int_t prval, kmin, kmax, maxElt, ktemp, prpos; - float mem_dnsCS; - - if (!rcvd_dnsSep) - VInfo->curblk_loc += 2; - - computeL = TRUE; computeU = TRUE; - lstVtx_dns_lid = EMPTY; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - cur_blk = VInfo->curblk_loc; - fstVtx_dns = VInfo->begEndBlks_loc[cur_blk]; - fstVtx_dns_lid = LOCAL_IND( globToLoc[fstVtx_dns] ); - lvl = (int_t) log2( (double) npNode ); - x_newelts_U = NULL; - newelts_L = NULL; - newelts_U = NULL; - mem_dnsCS = 0.; - - PS->nDnsCurSep ++; - - if (CS->rcv_bufSz > n - fstVtx_dns) - minElt_vtx = CS->rcv_buf; - else { - if (!(minElt_vtx = intMalloc_symbfact(n - fstVtx_dns))) - ABORT("Malloc fails for minElt_vtx[]."); - mem_dnsCS += n - fstVtx_dns; - } - - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - x_newelts = Llu_symbfact->cntelt_vtcs; - x_newelts_L = x_newelts; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - } - - /* use minElt_vtx to determine starting vertex of each nonzero element */ - for (i = 0; i < n - fstVtx_dns; i++) - minElt_vtx[i] = n; - - ind_blk = cur_blk; - vtx_lid = fstVtx_dns_lid; - nvtcs_upd = 0; - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - nvtcs_upd += lstVtx_blk - fstVtx_blk; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - j = xsub[vtx_lid]; - while (j < xsub[vtx_lid+1] && sub[j] != EMPTY) { - PS->nops ++; - vtx_elt = sub[j] - fstVtx_dns; - if (minElt_vtx[vtx_elt] == n) { - minElt_vtx[vtx_elt] = vtx; - } - j ++; - } - } - } - if (!computeL) { - if (!(x_newelts_U = intMalloc_symbfact(nvtcs_upd + 1))) - ABORT("Malloc fails for x_newelts_U[]."); - mem_dnsCS += nvtcs_upd + 1; - x_newelts = x_newelts_U; - } - else { - /* save the value in cntelt_vtcs[lstVtx_blk_lid] */ - save_cnt = x_newelts[vtx_lid]; - lstVtx_dns_lid = vtx_lid; - } - - MPI_Allreduce (&(minElt_vtx[lstVtx - fstVtx_dns]), &(marker[lstVtx]), - n - lstVtx, mpi_int_t, MPI_MIN, ndCom); - -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) log2( (double) npNode )); - PS->sz_msgsCol += (float) (n - lstVtx); - if (PS->maxsz_msgCol < n - lstVtx) - PS->maxsz_msgCol = n - lstVtx; -#endif - - /* use x_newelts to determine counts of elements starting in each vertex */ - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) - x_newelts[vtx_lid] = 0; - - for (vtx = lstVtx; vtx < n; vtx++) { - if (marker[vtx] != n) { - vtx_elt = marker[vtx]; - if (OWNER( globToLoc[vtx_elt] ) == iam) { - x_newelts[ LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid ] ++; - } - else { - /* find the first vertex > vtx_elt which belongs to iam */ - ind_blk = cur_blk; - vtx_lid = 0; - while (vtx_elt > VInfo->begEndBlks_loc[ind_blk] && - ind_blk < 2 * VInfo->nblks_loc) { - vtx_lid += VInfo->begEndBlks_loc[ind_blk+1] - - VInfo->begEndBlks_loc[ind_blk]; - ind_blk += 2; - } - if (VInfo->begEndBlks_loc[ind_blk] < lstVtx) { - x_newelts[vtx_lid] ++; - marker[vtx] = VInfo->begEndBlks_loc[ind_blk]; - } - else - marker[vtx] = n; - } - } - } - - /* set up beginning of new elements for each local vtx */ - i = 0; - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) { - j = x_newelts[vtx_lid]; - x_newelts[vtx_lid] = i; - i += j; - } - x_newelts[vtx_lid] = i; - newelts = NULL; - if (i != 0) { - if (!(newelts = intMalloc_symbfact(x_newelts[vtx_lid]))) - ABORT("Malloc fails for newelts[]."); - mem_dnsCS += x_newelts[vtx_lid]; - - for (vtx = lstVtx; vtx < n; vtx++) { - if (marker[vtx] != n) { - vtx_elt = marker[vtx]; - vtx_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid; - newelts[x_newelts[vtx_lid]] = vtx; - x_newelts[vtx_lid] ++; - } - } - } - /* reset beginning of new elements for each local vertex */ - i = 0; - for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) { - j = x_newelts[vtx_lid]; - x_newelts[vtx_lid] = i; - i = j; - } - - if (computeL == TRUE) { - computeL = FALSE; - newelts_L = newelts; - } - else { - computeU = FALSE; - newelts_U = newelts; - } - } - - for (i = fstVtx_dns; i < n; i++) - marker[i] = EMPTY; - mark = 0; - - /* update vertices */ - prval = n; - ind_blk = cur_blk; - fstVtx_dns = VInfo->begEndBlks_loc[ind_blk]; - vtx_lid = LOCAL_IND( globToLoc[fstVtx_dns] ); - while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && - ind_blk < 2 * VInfo->nblks_loc) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - ind_blk += 2; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) { - vtx_lid_x = vtx_lid - fstVtx_dns_lid; - Llu_symbfact->xlsub[vtx_lid] = *p_nextl; - Llu_symbfact->xusub[vtx_lid] = *p_nextu; - if (vtx == fstVtx_blk || x_newelts_L[vtx_lid_x+1] != x_newelts_L[vtx_lid_x] || - x_newelts_U[vtx_lid_x+1] != x_newelts_U[vtx_lid_x]) { - /* a new supernode starts */ - snlid = vtx_lid; - snrep = vtx; - if (mark + 2 > n) { - /* reset to EMPTY marker array */ - for (i = 0; i < n; i++) - marker[i] = EMPTY; - mark = 0; - } - - computeL = TRUE; - computeU = FALSE; - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - x_newelts = x_newelts_L; newelts = newelts_L; - next = *p_nextl; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - x_newelts = x_newelts_U; newelts = newelts_U; - next = *p_nextu; - } - xsub[vtx_lid] = next; - - /* TEST available memory */ - j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; - if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || - (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - if (computeL) sub = Llu_symbfact->lsub; - else sub = Llu_symbfact->usub; - } - - if (computeL) i = vtx; - else i = vtx+1; - while (i < lstVtx) { - sub[next] = i; next ++; - i ++; - } - PS->nops += x_newelts[vtx_lid_x+1]; - for (i = 0; i < x_newelts[vtx_lid_x+1]; i++) { - vtx_elt = newelts[i]; - sub[next] = vtx_elt; next ++; - if (computeU && vtx_elt < prval - && marker[vtx_elt] == mark-1) - prval = vtx_elt; - marker[vtx_elt] = mark; - } - if (computeL) { - computeL = FALSE; computeU = TRUE; - *p_nextl = next; - } - else { - computeU = FALSE; - *p_nextu = next; - } - mark ++; - } - if (vtx != fstVtx_blk) - (*p_nsuper_loc) ++; - } /* a new supernode starts */ - /* vtx belongs to the curent supernode */ - Pslu_freeable->supno_loc[vtx_lid] = *p_nsuper_loc; - } - (*p_nsuper_loc) ++; - } - - if (ind_blk > 0) { - /* if iam owns blocks of this level */ - i = *p_nextl - Llu_symbfact->xlsub[snlid]; - j = *p_nextu - Llu_symbfact->xusub[snlid]; - - if (VInfo->begEndBlks_loc[ind_blk - 1] == lstVtx && i > 1 && j > 0) { - /* if iam the last processor owning a block of this level */ - computeL = TRUE; computeU = FALSE; - /* prune the structure */ - while (computeL || computeU) { - if (computeL) { - sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub; - next = *p_nextl; - computeL = FALSE; computeU = TRUE; - } - else { - sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub; - next = *p_nextu; - computeU = FALSE; - } - - kmin = xsub[snlid]; - kmax = next - 1; - if (prval != n) { - maxElt = prval; - while (kmin <= kmax) { - /* Do a quicksort-type partition. */ - if (sub[kmax] > prval) - kmax--; - else if (sub[kmin] <= prval) { - kmin++; - } - else { /* kmin does'nt belong to G^s(L), and kmax belongs: - * interchange the two subscripts - */ - ktemp = sub[kmin]; - sub[kmin] = sub[kmax]; - sub[kmax] = ktemp; - kmin ++; - kmax --; - } - if (sub[kmin-1] == prval) - prpos = kmin - 1; - } - } - else { - maxElt = EMPTY; - while (kmin <= kmax) { - /* compute maximum element of L(:, vtx) */ - if (sub[kmin] > maxElt) { - maxElt = sub[kmin]; - prpos = kmin; - } - kmin ++; - } - } - ktemp = sub[xsub[snlid]]; - sub[xsub[snlid]] = maxElt; - sub[prpos] = ktemp; - } - - /* setup snd_interSz information */ - prval = Llu_symbfact->lsub[Llu_symbfact->xlsub[snlid]]; - if (prval >= lstVtx) { - /* this supernode will be send to next layers of the tree */ - while (prval >= lstVtx && szSep != 1) { - ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2; - ind_sizes1 += szSep; - lvl ++; - szSep = szSep / 2; - lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2]; - CS->snd_interSz[lvl] += i + j + 4; - CS->snd_LinterSz[lvl] += i + 2; - if (CS->snd_vtxinter[lvl] == EMPTY) - CS->snd_vtxinter[lvl] = snrep; - } - } - } - } - - /* restore value in cntelt_vtcs */ - if (lstVtx_dns_lid != EMPTY) - Llu_symbfact->cntelt_vtcs[lstVtx_dns_lid] = save_cnt; - *p_mark = mark; - if (minElt_vtx != CS->rcv_buf) - SUPERLU_FREE (minElt_vtx); - SUPERLU_FREE (x_newelts_U); - if (newelts_L) SUPERLU_FREE (newelts_L); - if (newelts_U) SUPERLU_FREE (newelts_U); - if (PS->szDnsSep < mem_dnsCS) - PS->szDnsSep = mem_dnsCS; -} - -/* all processors affected to current node must call this routine - when VInfo->filledSep == FILLED_SEP - This is necessary since subsequent routines called from here use - MPI_allreduce among all processors affected to curent node */ - -static int_t -denseSep_symbfact -( - int rcvd_dnsSep, /* =1 if processor received info that the separator - became dense, - =0 if myPE determined that separator is full */ - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each separator in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int szSep, - int fstP, /* first pe affected current node */ - int lstP, /* last pe affected current node */ - int_t fstVtx_blkCyc, - int_t nblk_loc, /* block number in the block cyclic distribution of current - supernode */ - int_t *p_nextl, - int_t *p_nextu, - int_t *p_mark, - int_t *p_nsuper_loc, - int_t *marker, - MPI_Comm ndCom, - MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS -) -{ - int nprocsLvl, p, prvP, tag; - int_t nmsgsToSnd, nmsgsToRcv; - int_t ind_blk, mem_error; - int_t *rcv_intraLvl; - int_t fstVtx, lstVtx, cur_blk, lstVtx_blk, fstVtx_blk; - int_t *globToLoc, maxNvtcsPProc; - MPI_Status status; - - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - rcv_intraLvl = CS->rcv_intraLvl; - cur_blk = VInfo->curblk_loc; - nprocsLvl = lstP - fstP; - - if (nblk_loc == 0) { - nmsgsToSnd = 2; nmsgsToRcv = 1; - } - else { - nmsgsToSnd = 1; nmsgsToRcv = 0; - if (!rcvd_dnsSep) nmsgsToRcv ++; - } - if (iam == fstP && rcvd_dnsSep && nblk_loc == 1) - nmsgsToRcv ++; - - /* first exchange msgs with all processors affected to current node */ - ind_blk = cur_blk; - while ((nmsgsToSnd || nmsgsToRcv) && VInfo->begEndBlks_loc[ind_blk] < lstVtx) { - tag = (int) (tag_intraLvl + nblk_loc); - if (nmsgsToSnd) { - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - if (lstVtx_blk != lstVtx) { - p = OWNER( globToLoc[lstVtx_blk]); - MPI_Send (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p, - tag, (*symb_comm)); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd += (float) 1; -#endif - } - nmsgsToSnd --; - } - ind_blk += 2; - nblk_loc ++; - tag = tag_intraLvl + nblk_loc; - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - if (nmsgsToRcv && fstVtx_blk < lstVtx) { - if (iam == fstP) tag --; - prvP = OWNER( globToLoc[fstVtx_blk - 1]); - MPI_Recv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP, - tag, (*symb_comm), &status); -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd += (float) 1; -#endif - nmsgsToRcv --; - } - } - - if (VInfo->filledSep == FILLED_SEP) { - if (mem_error = - dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, lstP - fstP, rcvd_dnsSep, p_nextl, - p_nextu, p_mark, p_nsuper_loc, marker, ndCom, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) - return (mem_error); - } - else if (rcvd_dnsSep) - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, EMPTY, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - return 0; -} - - -static int_t -interLvl_symbfact -( - SuperMatrix *A, /* Input - input matrix A */ - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator (node) */ - int fstP, /* Input - first processor assigned to current node */ - int lstP, /* Input - last processor assigned to current node */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int_t *p_nextl, - int_t *p_nextu, - int_t *p_nsuper_loc, - int_t *pmark, /* mark for symbfact */ - int_t *marker, /* temp array used for marking */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - Pslu_freeable_t *Pslu_freeable, - comm_symbfact_t *CS,/* infos on communication data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - psymbfact_stat_t *PS, - MPI_Comm ndComm, - MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */ - ) -{ - MPI_Status *status; - MPI_Request *request_snd, *request_rcv; - - int nprocsLvl, rcvdP, p, filledSep_lvl; - int toSend, toSendL, toSendU; - int_t *rcv_interLvl; - int_t *snd_interLvl, *snd_interLvl1, *snd_interLvl2, - snd_interLvlSz, snd_LinterLvlSz, snd_vtxLvl; - int_t vtx_elt, update_loc, code_err; - int_t *lsub, *xlsub, *usub, *xusub; - int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz; - int_t n, mark, max_rcvSz; - int_t nextl, nextu, ind_blk, vtx_lid, k, count, nelts, - lstVtxLvl_loc, lstVtxLvl_loc_lid, mem_error; - int_t fstVtx_blk, lstVtx_blk, i, j, vtx, prElt_L, prElt_U, - snd_indBlk, prElt_ind; - int_t fstVtxLvl_loc, nvtcsLvl_loc, maxNvtcsPProc, *globToLoc, - fstVtx, lstVtx; - int ind1, nprocsToRcv, nprocsToSnd, ind2, ind_l, ind_u, ij, ik; - int_t req_ind, sent_msgs, req_ind_snd; - int_t initInfo_loc[2], initInfo_gl[2]; - - /* Initialization */ - n = A->ncol; - fstVtx = fstVtxSep[ind_sizes2]; - lstVtx = fstVtx + sizes[ind_sizes2]; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nprocsLvl = lstP - fstP; - rcv_interLvl = CS->rcv_interLvl; - snd_interLvl = CS->snd_interLvl; - snd_interLvlSz = CS->snd_interSz[lvl]; - snd_LinterLvlSz = CS->snd_LinterSz[lvl]; - snd_vtxLvl = CS->snd_vtxinter[lvl]; - fstVtxLvl_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - nvtcsLvl_loc = VInfo->nvtcsLvl_loc; - request_snd = NULL; - request_rcv = NULL; - status = NULL; - mark = *pmark; - - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* snd_vtxLvl denotes the first vertex from which iam needs - to send data. - snd_interLvlSz denotes maximum size of the send data, - snd_LinterLvlSz denotes send data corresponding to L part */ - - /* determine maximum size of receive buffer and information - on filled sep */ - if (snd_interLvlSz != 0) { - if (snd_LinterLvlSz == 0) - snd_interLvlSz = 0; - if (snd_interLvlSz - snd_LinterLvlSz == 0) - snd_interLvlSz = 0; - } - - initInfo_loc[0] = snd_interLvlSz; - initInfo_loc[1] = (int_t) VInfo->filledSep; - MPI_Allreduce (initInfo_loc, initInfo_gl, 2, - mpi_int_t, MPI_MAX, ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl )); - PS->sz_msgsCol += 2; - if (PS->maxsz_msgCol < 2) - PS->maxsz_msgCol = 2; -#endif - max_rcvSz = initInfo_gl[0]; - filledSep_lvl = (int) initInfo_gl[1]; - - if (filledSep_lvl == FILLED_SEPS) { - /* quick return if all upper separators are dense */ - if (VInfo->filledSep != FILLED_SEPS) { - VInfo->filledSep = FILLED_SEPS; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, - fstVtxSep, - EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) - return (mem_error); - } - return 0; - } - - if (max_rcvSz == 0) - /* quick return if no communication necessary */ - return 0; - - /* allocate data for the send buffer */ - if (snd_interLvlSz) - if (CS->snd_bufSz < snd_interLvlSz) { - PS->maxSzBuf += snd_interLvlSz - CS->snd_bufSz; - if (CS->snd_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->snd_buf); - CS->snd_bufSz = snd_interLvlSz; - if (!(CS->snd_buf = intMalloc_symbfact (snd_interLvlSz))) { - ABORT("Malloc fails for snd_buf[]."); - } - } - - /* snd_interLvl : to which processors the data need to be send - * information setup during the copy of data to be send in the buffer - * rcv_interLvl : from which processors iam receives update data */ - for (p = 2*fstP; p < 2*lstP; p++) - snd_interLvl[p] = EMPTY; - - if (snd_interLvlSz == 0 && nvtcsLvl_loc == 0) { - code_err = MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t, - &(rcv_interLvl[2*fstP]), 2, mpi_int_t, - ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl )); - PS->sz_msgsCol += 2; - if (PS->maxsz_msgCol < 2) - PS->maxsz_msgCol = 2; -#endif - return 0; - } - - /* in interLvlInfos, - * obtain from which processors iam receives update information */ - update_loc = FALSE; - nextl = 0; - nextu = snd_LinterLvlSz; - if (snd_interLvlSz != 0) { - /* copy data to be send */ - /* find index block from where to send data */ - ind_blk = VInfo->curblk_loc; - while (snd_vtxLvl < VInfo->begEndBlks_loc[ind_blk]) { - ind_blk -= 2; - } - snd_indBlk = ind_blk; - vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] ); - for (; ind_blk < VInfo->curblk_loc; ind_blk += 2) { - fstVtx_blk = VInfo->begEndBlks_loc[ind_blk]; - if (ind_blk == snd_indBlk) - fstVtx_blk = snd_vtxLvl; - lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1]; - for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) { - toSendL = FALSE; toSendU = FALSE; - if (xlsub[vtx_lid] != xlsub[vtx_lid+1] && - xusub[vtx_lid] != xusub[vtx_lid+1]) { - k = xlsub[vtx_lid]; - prElt_L = lsub[k]; - j = xusub[vtx_lid]; - prElt_U = usub[j]; - if (prElt_L >= fstVtx || prElt_U >= fstVtx) { - if (prElt_L >= fstVtx) - while (lsub[k] <= prElt_L && k < xlsub[vtx_lid + 1]) { - vtx_elt = lsub[k]; - if (vtx_elt >= fstVtx && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_interLvl[2*p] = TRUE; - toSendL = TRUE; - } - else - update_loc = TRUE; - } - k++; - } - if (prElt_U >= fstVtx) - while (usub[j] <= prElt_U && j < xusub[vtx_lid + 1]) { - vtx_elt = usub[j]; - if (vtx_elt >= fstVtx && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_interLvl[2*p+1] = TRUE; - toSendU = TRUE; - } - else - update_loc = TRUE; - } - j ++; - } - if (toSendL || toSendU) { - /* L(:, vtx) and U(vtx, :) will be send to processors */ - CS->snd_buf[nextu + DIAG_IND] = vtx; - nelts = xusub[vtx_lid+1] - xusub[vtx_lid]; - CS->snd_buf[nextu + NELTS_IND] = nelts; - nextu += 2; - for (j = xusub[vtx_lid]; j < xusub[vtx_lid+1]; j++, nextu ++) { - CS->snd_buf[nextu] = usub[j]; - } - CS->snd_buf[nextl + DIAG_IND] = vtx; - nelts = xlsub[vtx_lid+1] - xlsub[vtx_lid]; - CS->snd_buf[nextl + NELTS_IND] = nelts; - nextl += 2; - for (j = xlsub[vtx_lid]; j < xlsub[vtx_lid+1]; j++, nextl ++) { - CS->snd_buf[nextl] = lsub[j]; - } - } - } - } - } - } - lstVtxLvl_loc = vtx; - lstVtxLvl_loc_lid = vtx_lid; - } - - if (nextl == 0 || nextu - snd_LinterLvlSz == 0) { - for (p = 2*fstP; p < 2*lstP; p++) - snd_interLvl[p] = EMPTY; - } - - nprocsToSnd = 0; - for (p = 2*fstP; p < 2*lstP; p +=2) { - if (snd_interLvl[p] != EMPTY || snd_interLvl[p+1] != EMPTY) { - snd_interLvl[p] = nextl; - snd_interLvl[p+1] = nextu - snd_LinterLvlSz; - nprocsToSnd ++; - } - } - - MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t, - &(rcv_interLvl[2*fstP]), 2, mpi_int_t, ndComm); -#if ( PRNTlevel>=1 ) - PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl )); - PS->sz_msgsCol += 2 * nprocsLvl; - if (PS->maxsz_msgCol < 2 * nprocsLvl) - PS->maxsz_msgCol = 2 * nprocsLvl; -#endif - - max_rcvSz = 0; - nprocsToRcv = 0; - for (p = 2*fstP; p < 2*lstP; p +=2) { - CS->ptr_rcvBuf[p] = max_rcvSz; - if (rcv_interLvl[p] != EMPTY) - max_rcvSz += rcv_interLvl[p]; - CS->ptr_rcvBuf[p+1] = max_rcvSz; - if (rcv_interLvl[p+1] != EMPTY) - max_rcvSz += rcv_interLvl[p+1]; - if (rcv_interLvl[p] != EMPTY || rcv_interLvl[p+1] != EMPTY) - nprocsToRcv ++; - } - - /* allocate data for the receive buffer */ - if (CS->rcv_bufSz < max_rcvSz) { - PS->maxSzBuf += max_rcvSz - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = max_rcvSz; - if (!(CS->rcv_buf = intMalloc_symbfact (max_rcvSz))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - /* allocate memory for status arrays */ - if (nprocsToSnd) - if ( !(request_snd = (MPI_Request*) - SUPERLU_MALLOC(2 * nprocsToSnd * sizeof(MPI_Request)))) - ABORT("Not enough memory when allocating MPI_Request"); - if (nprocsToRcv) - if ( !(request_rcv = (MPI_Request*) - SUPERLU_MALLOC(2 * nprocsToRcv * sizeof(MPI_Request)))) - ABORT("Not enough memory when allocating MPI_Request"); - if (nprocsToRcv || nprocsToSnd) - if ( !(status = (MPI_Status*) - SUPERLU_MALLOC(2 * (lstP-fstP) * sizeof(MPI_Status)))) - ABORT("Not enough memory when allocating MPI_Request"); - - /* determine if we have to send data */ - i = 0; - for (toSend = fstP, p = 2*fstP; p < 2*lstP; toSend++, p+=2) - if (snd_interLvl[p] != EMPTY && toSend != iam) { - MPI_Isend (CS->snd_buf, nextl, mpi_int_t, toSend, - tag_interLvl_LData, (*symb_comm), &(request_snd[2*i])); - MPI_Isend (&(CS->snd_buf[snd_LinterLvlSz]), - nextu - snd_LinterLvlSz, mpi_int_t, toSend, - tag_interLvl_UData, (*symb_comm), &(request_snd[2*i+1])); - i++; -#if ( PRNTlevel>=1 ) - PS->no_msgsSnd += (float) 2; - PS->sz_msgsSnd += (float) (nextl + nextu - snd_LinterLvlSz); - if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl; - if (PS->maxsz_msgSnd < nextu - snd_LinterLvlSz) - PS->maxsz_msgSnd = nextu - snd_LinterLvlSz; -#endif - } - - if (update_loc) { - /* use own data to update symbolic factorization */ - vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] ); - lsub_rcvd = &(lsub[xlsub[vtx_lid]]); - lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid]; - usub_rcvd = &(usub[xusub[vtx_lid]]); - usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid]; - - mem_error = - rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - - lsub_rcvd = &(Llu_symbfact->lsub[xlsub[vtx_lid]]); - lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid]; - usub_rcvd = &(Llu_symbfact->usub[xusub[vtx_lid]]); - usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid]; - lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub; - mem_error = - rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub; - } - - /* post non-blocking receives for all the incoming messages */ - i = 0; - for (rcvdP = fstP, p = 2*fstP; p < 2*lstP; rcvdP++, p += 2) - if (rcv_interLvl[p] != EMPTY) { - lsub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[p]]); - MPI_Irecv (lsub_rcvd, rcv_interLvl[p], mpi_int_t, rcvdP, - tag_interLvl_LData, (*symb_comm), &(request_rcv[i])); - usub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[p+1]]); - MPI_Irecv (usub_rcvd, rcv_interLvl[p+1], mpi_int_t, rcvdP, - tag_interLvl_UData, (*symb_comm), &(request_rcv[i+1])); - i += 2; -#if ( PRNTlevel>=1 ) - PS->no_msgsRcvd += (float) 2; - PS->sz_msgsRcvd += (float) (rcv_interLvl[p] + rcv_interLvl[p+1]); - if (PS->maxsz_msgRcvd < rcv_interLvl[p]) - PS->maxsz_msgRcvd = rcv_interLvl[p]; - if (PS->maxsz_msgRcvd < rcv_interLvl[p+1]) - PS->maxsz_msgRcvd = rcv_interLvl[p+1]; -#endif - } - - /* wait until messages are received and update local data */ - for (i = 0; i < nprocsToRcv; i++) { - MPI_Waitany (2*nprocsToRcv, request_rcv, &ind1, status); - ij = 0; - for (p = fstP; p < lstP; p++) - if (rcv_interLvl[2*p] != EMPTY) { - if (ij <= ind1 && ind1 < ij+2) { - rcvdP = p; p = lstP; - if (ind1 == ij) ind2 = ij+1; - else ind2 = ind1 - 1; - ind_l = ij; ind_u = ij+1; - } - ij += 2; - } - MPI_Get_count (status, mpi_int_t, &ij); - MPI_Wait (&(request_rcv[ind2]), status); - MPI_Get_count (status, mpi_int_t, &ik); - if (ind1 == ind_l) { - lsub_rcvd_sz = ij; - usub_rcvd_sz = ik; - } else { - lsub_rcvd_sz = ik; - usub_rcvd_sz = ij; - } - lsub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP]]); - usub_rcvd = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP+1]]); - - /* use received data to update symbolic factorization information */ - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtxLvl_loc, lstVtx, nvtcsLvl_loc, - 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - if (nprocsToSnd) - MPI_Waitall (2*nprocsToSnd, request_snd, status); - - *pmark = mark; - if (request_snd != NULL) SUPERLU_FREE (request_snd); - if (request_rcv != NULL) SUPERLU_FREE (request_rcv); - if (status != NULL) SUPERLU_FREE (status); -} - - -static void -createComm -( - int iam, /* Input -my processor number */ - int nprocs, /* Input -number of procs for the symbolic factorization */ - MPI_Comm *commLvls, /* Output -communicators for the nodes in the sep tree */ - MPI_Comm *symb_comm - ) -{ - int szSep, i, j, jj, k, *pranks; - int np, npNode, fstP, lstP, p, code_err, ind, col, key; - MPI_Group mpi_new_group, mpi_base_group; - - /* Make a list of the processes in the new communicator. */ - pranks = (int *) SUPERLU_MALLOC( nprocs * sizeof(int) ); - - i = 2 * nprocs - 2; - MPI_Comm_dup ((*symb_comm), &(commLvls[i])); - szSep = 2; - i -= szSep; - - while (i > 0) { - /* for each level in the separator tree */ - npNode = nprocs / szSep; - fstP = 0; - /* for each node in the level */ - for (j = i; j < i + szSep; j++) { - lstP = fstP + npNode; - if (fstP <= iam && iam < lstP) { - ind = j; - key = iam - fstP; - col = fstP; - } - fstP += npNode; - } - MPI_Comm_split ((*symb_comm), col, key, &(commLvls[ind]) ); - szSep *= 2; - i -= szSep; - } - - SUPERLU_FREE (pranks); -} - -static void -intraLvl_symbfact -( - SuperMatrix *A, /* Input - original matrix A */ - int iam, /* Input - my processor number */ - int lvl, /* Input - current level in the separator tree */ - int szSep, /* Input - size of the current separator(node) */ - int ind_sizes1, - int ind_sizes2, - int_t *sizes, /* Input - sizes of each node in the separator tree */ - int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ - int fstP, /* Input - first processor assigned to current node */ - int lstP, /* Input - last processor assigned to current node */ - int_t fstVtx, /* Input - first vertex of current node */ - int_t lstVtx, /* Input - last vertex of current node */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS, - int_t *marker, - int_t *p_mark, /* marker used to merge elements of vertices */ - int_t *p_nextl, /* ptr to nextl in lsub structure */ - int_t *p_nextu, /* ptr to nextu in usub structure */ - int_t *p_neltsZr, /* no of artificial zeros introduced so far */ - int_t *p_neltsTotal, /* no of nonzeros (including artificials) - computed so far */ - int_t *p_nsuper_loc, - MPI_Comm ndComm, - MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */ - ) -{ - int nprocsLvl, p, prvP, rcvP; - int toSend, rcvd_prvP, index_req[2]; - int_t fstVtx_loc_lid, fstVtx_loc, vtx, vtxLvl, curblk_loc, denseSep; - int_t fstVtx_blk, fstVtx_blk_lid, lstVtx_blk, lstVtx_blk_lid, tag; - int_t nvtcs_blk, xusub_end, xlsub_end, prv_fstVtx_blk; - int_t n; - int_t *rcv_intraLvl, *snd_intraLvl; - int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz; - int_t nmsgsRcvd, nmsgsTRcv, sz_msg; - int_t nvtcsLvl_loc, nextl, nextu, ind_blk, snd_vtxLvl, maxNeltsVtx_in; - int_t count, vtx_loc, mem_error, lstBlkRcvd; - int_t fstVtx_blk_loc, fstBlk, vtx_lid, prElt, nelts, j, nvtcs_toUpd; - int_t snd_LinterLvlSz, fstVtx_blk_loc_lid, prElt_ind, maxNmsgsToRcv; - int_t *xlsub, *xusub, *lsub, *usub; - int_t *globToLoc, maxNvtcsPProc, nblk_loc, upd_myD, r, fstVtx_blkCyc; - int_t k, prElt_L, prElt_U, vtx_elt, fstVtx_toUpd; - int intSzMsg; - - MPI_Status status[4]; - MPI_Request request[4]; - - /* Initializations */ - lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub; - usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub; - - /* max number of msgs this processor can receive during - intraLvl_symbfact routine */ - maxNmsgsToRcv = (lstVtx - fstVtx) / VInfo->maxSzBlk + 1; - maxNeltsVtx_in = VInfo->maxNeltsVtx; - globToLoc = Pslu_freeable->globToLoc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - n = A->ncol; - nprocsLvl = lstP - fstP; - rcv_intraLvl = CS->rcv_intraLvl; - snd_intraLvl = CS->snd_intraLvl; - nvtcsLvl_loc = VInfo->nvtcsLvl_loc; - nmsgsTRcv = 0; - nmsgsRcvd = 0; - nblk_loc = 0; - nvtcs_toUpd = nvtcsLvl_loc; - fstVtx_blk = fstVtx; - denseSep = FALSE; - - /* determine first vertex that belongs to fstP */ - k = fstVtx; - fstVtx_blkCyc = n; - while (k < lstVtx && fstVtx_blkCyc == n) { - p = OWNER( globToLoc[k] ); - if (p == fstP) - fstVtx_blkCyc = k; - k += VInfo->maxSzBlk; - } - - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = 0; - - for (r = 0; r < 3; r++) - request[r] = MPI_REQUEST_NULL; - - fstVtx_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); - vtx = fstVtx_loc; - if (fstVtx_loc >= fstVtx_blkCyc) - nblk_loc = 1; - while (VInfo->begEndBlks_loc[VInfo->curblk_loc] < lstVtx && !VInfo->filledSep) { - CS->snd_intraSz = 0; - CS->snd_LintraSz = 0; - - lstBlkRcvd = FALSE; - prv_fstVtx_blk = fstVtx_blk; - fstVtx_blk = VInfo->begEndBlks_loc[VInfo->curblk_loc]; - lstVtx_blk = VInfo->begEndBlks_loc[VInfo->curblk_loc + 1]; - fstVtx_toUpd = VInfo->begEndBlks_loc[VInfo->curblk_loc + 2]; - fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] ); - lstVtx_blk_lid = LOCAL_IND( globToLoc[lstVtx_blk - 1] + 1); - nvtcs_blk = lstVtx_blk - fstVtx_blk; - nvtcs_toUpd -= nvtcs_blk; - nmsgsTRcv = n; - VInfo->maxNeltsVtx -= fstVtx_blk - prv_fstVtx_blk; - - index_req[0] = EMPTY; - for (r = 0; r < 3; r++) - request[r] = MPI_REQUEST_NULL; - if (fstVtx_blk != fstVtx) { - /* if not the first vertex of the level */ - prvP = OWNER( globToLoc[fstVtx_blk - 1] ); - rcvd_prvP = FALSE; - /* receive info on number messages to receive */ - tag = tag_intraLvl + nblk_loc; - if (iam == fstP) tag --; - - MPI_Irecv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP, - tag, (*symb_comm), &(request[1])); - - while (!rcvd_prvP || nmsgsRcvd < nmsgsTRcv) { - if (index_req[0] != 1) { - MPI_Irecv (&sz_msg, 1, mpi_int_t, - MPI_ANY_SOURCE, tag_intraLvl_szMsg, - (*symb_comm), &(request[0])); - if (sz_msg > INT_MAX) - ABORT("ERROR in intraLvl_symbfact size to send > INT_MAX\n"); - } - MPI_Waitany (2, request, index_req, status); - if (index_req[0] == 1) { - /* receive information on no msgs to receive */ -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd ++; -#endif - rcvd_prvP = TRUE; - nmsgsTRcv = rcv_intraLvl[iam]; - /* if dense separator was detected by one of the - previous processors ... */ - if (nmsgsTRcv > maxNmsgsToRcv) { - VInfo->filledSep = (int) nmsgsTRcv / maxNmsgsToRcv; - nmsgsTRcv = nmsgsTRcv % maxNmsgsToRcv; - } - - if (nmsgsTRcv == nmsgsRcvd) { - /* MPI_Cancel (&(request[0])); */ - MPI_Send (&r, 1, mpi_int_t, iam, - tag_intraLvl_szMsg, (*symb_comm)); - MPI_Wait (&(request[0]), status); - } - } - if (index_req[0] == 0) { - nmsgsRcvd ++; - if (nmsgsTRcv == nmsgsRcvd) lstBlkRcvd = TRUE; - rcvP = status->MPI_SOURCE; - - /* allocate enough space to receive data */ - if (CS->rcv_bufSz < sz_msg) { - PS->maxSzBuf += sz_msg - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = sz_msg; - if (!(CS->rcv_buf = intMalloc_symbfact (sz_msg))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - /* use received data to update symbolic factorization */ - lsub_rcvd = CS->rcv_buf; - MPI_Recv (lsub_rcvd, sz_msg, mpi_int_t, - rcvP, tag_intraLvl_LData, (*symb_comm), status); - MPI_Get_count (status, mpi_int_t, &intSzMsg); - lsub_rcvd_sz = intSzMsg; - usub_rcvd = &(CS->rcv_buf[lsub_rcvd_sz]); - MPI_Recv (usub_rcvd, sz_msg - lsub_rcvd_sz, - mpi_int_t, rcvP, - tag_intraLvl_UData, (*symb_comm), status); - MPI_Get_count (status, mpi_int_t, &intSzMsg); - usub_rcvd_sz = intSzMsg; -#if ( PRNTlevel>=1 ) - PS->no_shmRcvd ++; - PS->no_msgsRcvd += (float) 2; - PS->sz_msgsRcvd += (float) sz_msg; - if (PS->maxsz_msgRcvd < lsub_rcvd_sz) PS->maxsz_msgRcvd = lsub_rcvd_sz; - if (PS->maxsz_msgRcvd < usub_rcvd_sz) PS->maxsz_msgRcvd = usub_rcvd_sz; -#endif - - if (!lstBlkRcvd) { - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - } - } - } - - if (VInfo->filledSep) { - mem_error = - denseSep_symbfact (1, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, - ndComm, symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - else { - /* compute symbolic factorization for this block */ - if (!lstBlkRcvd) { - lsub_rcvd = NULL; usub_rcvd = NULL; - } - - blk_symbfact (A, iam, lvl, - szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - fstVtx_loc, fstVtx_blk, lstVtx_blk, - lsub_rcvd, lsub_rcvd_sz, usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, CS, PS, - marker, p_mark, - p_nextl, p_nextu, p_neltsZr, p_neltsTotal, - p_nsuper_loc); - lsub = Llu_symbfact->lsub; - usub = Llu_symbfact->usub; - - if (lstVtx_blk != lstVtx) { - /* if this is not the last block of the level */ - if (VInfo->filledSep == FILLED_SEPS || - ( VInfo->filledSep == FILLED_SEP && - ((lstVtx - lstVtx_blk > VInfo->maxSzBlk * nprocsLvl && nblk_loc > 0) || - (lstVtx - fstVtx_blkCyc > VInfo->maxSzBlk * nprocsLvl && nblk_loc == 0)))) - /* if current separator is dense and this is not the last block, - then ... */ - denseSep = TRUE; - else - /* separator dense but not enough uncomputed blocks - in the separator to take advantage of it */ - VInfo->filledSep = FALSE; - - if (VInfo->filledSep == FILLED_SEPS) { - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + rcv_intraLvl[p]; - denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, - symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - else { - /* send blk to next procs and update the rest of my own blocks */ - if (lstBlkRcvd) { - mem_error = - rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - mem_error = - rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - - upd_myD = FALSE; - /* determine processors to which send this block - and copy data to be sent */ - for (p = fstP; p < lstP; p++) - snd_intraLvl[p] = FALSE; - nextl = 0; - nextu = nextl + CS->snd_LintraSz; - - /* allocate enough space to receive data */ - if (CS->rcv_bufSz < CS->snd_intraSz) { - PS->maxSzBuf += CS->snd_intraSz - CS->rcv_bufSz; - if (CS->rcv_bufSz != 0) - /* not first time allocate memory */ - SUPERLU_FREE (CS->rcv_buf); - CS->rcv_bufSz = CS->snd_intraSz; - if (!(CS->rcv_buf = intMalloc_symbfact (CS->snd_intraSz))) { - ABORT("Malloc fails for rcv_buf[]."); - } - } - - for (vtx = fstVtx_blk, vtx_lid = fstVtx_blk_lid; - vtx < lstVtx_blk; vtx++, vtx_lid ++) { - toSend = FALSE; - k = xlsub[vtx_lid]; - prElt_L = lsub[k]; - j = xusub[vtx_lid]; - prElt_U = usub[j]; - - if (prElt_L >= lstVtx_blk || prElt_U >= lstVtx_blk) { - if (vtx == lstVtx_blk - 1) { - xlsub_end = *p_nextl; - xusub_end = *p_nextu; - } - else { - xlsub_end = xlsub[vtx_lid + 1]; - xusub_end = xusub[vtx_lid + 1]; - } - if (prElt_L >= lstVtx_blk) { - while (lsub[k] <= prElt_L && k < xlsub_end) { - vtx_elt = lsub[k]; - if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_intraLvl[p] = TRUE; - toSend = TRUE; - } - else { - upd_myD = TRUE; - } - } - k++; - } - } - if (prElt_U >= lstVtx_blk) { - while (usub[j] <= prElt_U && j < xusub_end) { - vtx_elt = usub[j]; - if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) { - p = OWNER( globToLoc[vtx_elt] ); - if (p != iam) { - /* vtx will be send to another processor */ - snd_intraLvl[p] = TRUE; - toSend = TRUE; - } - else { - upd_myD = TRUE; - } - } - j ++; - } - } - if (toSend) { - /* L(:, vtx) and U(vtx, :) will be send to processors */ - nelts = xusub_end - xusub[vtx_lid]; - CS->rcv_buf[nextu + DIAG_IND] = vtx; - CS->rcv_buf[nextu + NELTS_IND] = nelts; - nextu += 2; - for (j = xusub[vtx_lid]; j < xusub_end; j++) { - CS->rcv_buf[nextu] = usub[j]; nextu ++; - } - - nelts = xlsub_end - xlsub[vtx_lid]; - CS->rcv_buf[nextl + DIAG_IND] = vtx; - CS->rcv_buf[nextl + NELTS_IND] = nelts; - nextl += 2; - for (j = xlsub[vtx_lid]; j < xlsub_end; j++) { - CS->rcv_buf[nextl] = lsub[j]; nextl ++; - } - } - } - } - for (p = fstP; p < lstP; p++) - if (snd_intraLvl[p]) - rcv_intraLvl[p] ++; - - if (VInfo->filledSep == FILLED_SEP) { - for (p = fstP; p < lstP; p++) - rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + - rcv_intraLvl[p]; - } - else { - /* send to the owner of the next block info on no of messages */ - p = OWNER( globToLoc[lstVtx_blk] ); - tag = tag_intraLvl + nblk_loc; - - MPI_Isend (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p, - tag, (*symb_comm), request); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd ++; -#endif - } - - /* there is data to be send */ - sz_msg = nextl + nextu - CS->snd_LintraSz; - for (p = fstP; p < lstP; p++) { - if (p != iam && snd_intraLvl[p]) { - MPI_Isend (&sz_msg, 1, mpi_int_t, p, - tag_intraLvl_szMsg, (*symb_comm), &(request[1])); - MPI_Isend (CS->rcv_buf, nextl, mpi_int_t, p, - tag_intraLvl_LData, (*symb_comm), &(request[2])); - MPI_Isend (&(CS->rcv_buf[CS->snd_LintraSz]), - nextu - CS->snd_LintraSz, mpi_int_t, p, - tag_intraLvl_UData, (*symb_comm), &(request[3])); - MPI_Waitall(3, &(request[1]), &(status[1])); -#if ( PRNTlevel>=1 ) - PS->no_shmSnd ++; - PS->no_msgsSnd += (float) 2; - PS->sz_msgsSnd += (float) sz_msg; - if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl; - if (PS->maxsz_msgSnd < nextu - CS->snd_LintraSz) - PS->maxsz_msgSnd = nextu - CS->snd_LintraSz; -#endif - } - } - if (VInfo->filledSep != FILLED_SEP) { - MPI_Wait (request, status); - } - - /* update rest of vertices */ - if (upd_myD) { - lsub_rcvd_sz = (*p_nextl) - xlsub[fstVtx_blk_lid]; - lsub_rcvd = &(lsub[xlsub[fstVtx_blk_lid]]); - usub_rcvd_sz = (*p_nextu) - xusub[fstVtx_blk_lid]; - usub_rcvd = &(usub[xusub[fstVtx_blk_lid]]); - - mem_error = - rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, fstVtx_blk, lstVtx_blk, - EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 1, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - lsub = Llu_symbfact->lsub; - lsub_rcvd = &(lsub[xlsub[fstVtx_blk_lid]]); - mem_error = - rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz, - lsub_rcvd, lsub_rcvd_sz, fstVtx_blk, lstVtx_blk, - EMPTY, - fstVtx_toUpd, lstVtx, nvtcs_toUpd, - 0, p_mark, - marker, Pslu_freeable, Llu_symbfact, VInfo, PS); - usub = Llu_symbfact->usub; - } - if (VInfo->filledSep == FILLED_SEP) - denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc, - p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, - symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS); - } - } - } - VInfo->curblk_loc += 2; - nblk_loc ++; - } - - /* update maxNeltsVtx */ - VInfo->maxNeltsVtx = maxNeltsVtx_in - lstVtx + fstVtx; - - /* if current separator dense, then reset value of filledSep */ - if (VInfo->filledSep == FILLED_SEP) - VInfo->filledSep = FALSE; -} - -static void -symbfact_free -( - int iam, /* Input - my processor number */ - int nprocs, /* Input - number of processors for the symbolic factorization */ - Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */ - comm_symbfact_t *CS - ) -{ - /* free memory corresponding to prune structure */ - if (Llu_symbfact->szLsubPr != 0) - SUPERLU_FREE( Llu_symbfact->lsubPr ); -#if 0 - if (Llu_symbfact->szUsubPr != 0) - SUPERLU_FREE( Llu_symbfact->usubPr ); -#else /* XSL correction */ - if (Llu_symbfact->usubPr != NULL) - SUPERLU_FREE( Llu_symbfact->usubPr ); -#endif - if (Llu_symbfact->xlsubPr != NULL) - SUPERLU_FREE( Llu_symbfact->xlsubPr ); - if (Llu_symbfact->xusubPr != NULL) - SUPERLU_FREE( Llu_symbfact->xusubPr ); - - if (Llu_symbfact->xlsub_rcvd != NULL) - SUPERLU_FREE( Llu_symbfact->xlsub_rcvd); - if (Llu_symbfact->xusub_rcvd != NULL) - SUPERLU_FREE( Llu_symbfact->xusub_rcvd); - - if (Llu_symbfact->cntelt_vtcs != NULL) - SUPERLU_FREE( Llu_symbfact->cntelt_vtcs); - if (Llu_symbfact->cntelt_vtcsA_lvl != NULL) - SUPERLU_FREE( Llu_symbfact->cntelt_vtcsA_lvl); - - if (CS->rcv_bufSz != 0) - SUPERLU_FREE( CS->rcv_buf ); - if (CS->snd_bufSz != 0) - SUPERLU_FREE( CS->snd_buf ); - - SUPERLU_FREE( VInfo->begEndBlks_loc); - SUPERLU_FREE( CS->rcv_interLvl); - SUPERLU_FREE( CS->snd_interLvl); - SUPERLU_FREE( CS->ptr_rcvBuf); - SUPERLU_FREE( CS->rcv_intraLvl); - SUPERLU_FREE( CS->snd_intraLvl); - SUPERLU_FREE( CS->snd_interSz); - SUPERLU_FREE( CS->snd_LinterSz); - SUPERLU_FREE( CS->snd_vtxinter); -} - -static void -estimate_memUsage -( - int_t n, /* Input - order of the matrix */ - int iam, /* Input - my processor number */ - mem_usage_t *symb_mem_usage, - float *p_totalMemLU, /* Output -memory used for symbolic factorization */ - float *p_overestimMem, /* Output -memory allocated during to right looking - overestimation memory usage */ - Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */ - Llu_symbfact_t *Llu_symbfact, /* Input - local L, U data structures */ - vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */ - comm_symbfact_t *CS, - psymbfact_stat_t *PS - ) -{ - int_t nvtcs_loc, lword, nsuper_loc; - float lu_mem, other_mem, overestimMem; - - nvtcs_loc = VInfo->nvtcs_loc; - nsuper_loc = Pslu_freeable->supno_loc[nvtcs_loc]; - lword = sizeof(int_t); - - /* memory for xlsub, xusub, supno_loc, cntelt_vtcs */ - lu_mem = 4.0 * (float) nvtcs_loc * (float) lword; - /* memory for xlsubPr, xusubPr */ - lu_mem += 2.0 * (float) VInfo->maxNvtcsNds_loc * (float) lword; - - if (PS->estimLSz < Llu_symbfact->xlsub[nvtcs_loc]) - PS->estimLSz = Llu_symbfact->xlsub[nvtcs_loc]; - if (PS->estimUSz < Llu_symbfact->xusub[nvtcs_loc]) - PS->estimUSz = Llu_symbfact->xusub[nvtcs_loc]; - - lu_mem += (float) PS->estimLSz * lword; - lu_mem += (float) PS->estimUSz * lword; - lu_mem += (float) PS->maxSzLPr * lword; - lu_mem += (float) PS->maxSzUPr * lword; - lu_mem += (float) PS->szDnsSep * lword; - /* memory for globToLoc, tempArray */ - lu_mem += (float) 2* (float) n * lword; - lu_mem += (float) PS->maxSzBuf * lword; - - overestimMem = (float) (PS->estimLSz - Llu_symbfact->xlsub[nvtcs_loc]) * lword; - overestimMem += (float) (PS->estimUSz - Llu_symbfact->xusub[nvtcs_loc]) * lword; - - *p_totalMemLU = lu_mem; - *p_overestimMem = overestimMem; - - symb_mem_usage->for_lu = (float) ((3 * nvtcs_loc + 2 * nsuper_loc) * lword); - symb_mem_usage->for_lu += (float) (Llu_symbfact->xlsub[nvtcs_loc] * lword); - symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword); - symb_mem_usage->total = lu_mem; -} - - -static int_t * -intMalloc_symbfact(int_t n) -{ - int_t *buf; - if (n == 0) - buf = NULL; - else - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - return buf; -} - -static int_t * -intCalloc_symbfact(int_t n) -{ - int_t *buf; - register int_t i; - - if (n == 0) - buf = NULL; - else - buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); - if ( buf ) - for (i = 0; i < n; i++) buf[i] = 0; - return (buf); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,284 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#ifndef __SUPERLU_DIST_PSYMBFACT /* allow multiple inclusions */ -#define __SUPERLU_DIST_PSYMBFACT - -/* - * File name: psymbfact.h - * Purpose: Definitions for parallel symbolic factorization routine - */ - -/* - *-- Structure returned by the symbolic factorization routine - * - * Memory is allocated during parallel symbolic factorization - * symbfact_dist, and freed after dist_symbLU routine. - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: row subscripts - * - * (xusub,usub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xusub[j] points to the starting - * location of the j-th row in usub[*]. Note that xusub - * is indexed by rows. - * Storage: column subscripts - * - * (xsup_beg_loc,xsup_end_loc, supno_loc) describes mapping between - * supernode and column, information local to each processor: - * xsup_beg_loc[s] is the leading column of the local s-th supernode. - * xsup_end_loc[s] is the last column of the local s-th supernode. - * supno[i] is the supernode no to which column i belongs; - * - */ - -typedef struct { - int_t *xlsub; /* pointer to the beginning of each column of L */ - int_t *lsub; /* compressed L subscripts, stored by columns */ - int_t szLsub; /* current max size of lsub */ - - int_t *xusub; /* pointer to the beginning of each row of U */ - int_t *usub; /* compressed U subscripts, stored by rows */ - int_t szUsub; /* current max size of usub */ - - int_t *supno_loc; - int_t *xsup_beg_loc; - int_t *xsup_end_loc; - int_t nvtcs_loc; /* number of local vertices */ - int_t *globToLoc; /* global to local indexing */ - int_t maxNvtcsPProc; /* max number of vertices on the processors */ -} Pslu_freeable_t; - - -/* - *-- The structures are determined by symbfact_dist and not used thereafter. - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of L, as described above - * for Pslu_freeable_t. This structure is used internally in symbfact_dist. - * (xusub,usub): usub[*] contains the compressed subscript of U, as described above - * for Pslu_freeable_t. This structure is used internally in symbfact_dist. - * - * (xlsubPr,lsubPr): contains the pruned structure of the graph of - * L, stored by rows as a linked list. - * xlsubPr[j] points to the starting location of the j-th - * row in lsub[*]. - * Storage: original row subscripts. - * It contains the structure corresponding to one node in the sep_tree. - * In each independent domain formed by x vertices, xlsubPr is of size x. - * Allocated and freed during domain_symbolic. - * For the other nodes in the level tree, formed by a maximum of - * maxNvtcsNds_loc, xlsubPr is of size maxNvtcsNds_loc. - * Allocated after domain_symbolic, freed at the end of symbolic_dist - * routine. - * (xusubPr,usubPr): contains the pruned structure of the graph of - * U, stored by columns as a linked list. Similar to (xlsubPr,lsubPr), - * except that it is column oriented. - * - * This is allocated during symbolic factorization symbfact_dist. - */ - -typedef struct { - int_t *xlsubPr; /* pointer to pruned structure of L */ - int_t *lsubPr; /* pruned structure of L */ - int_t szLsubPr; /* size of lsubPr array */ - int_t indLsubPr; /* current index in lsubPr */ - int_t *xusubPr; /* pointer to pruned structure of U */ - int_t *usubPr; /* pruned structure of U */ - int_t szUsubPr; /* size of usubPr array */ - int_t indUsubPr; /* current index in usubPr */ - - int_t *xlsub_rcvd; - int_t *xlsub; /* pointer to structure of L, stored by columns */ - int_t *lsub; /* structure of L, stored by columns */ - int_t szLsub; /* current max size of lsub */ - int_t nextl; /* pointer to current computation in lsub */ - - int_t *xusub_rcvd; /* */ - int_t *xusub; /* pointer to structure of U, stored by rows */ - int_t *usub; /* structure of U, stored by rows */ - int_t szUsub; /* current max size of usub */ - int_t nextu; /* pointer to current computation in usub */ - - int_t *cntelt_vtcs; /* size of column/row for each vertex */ - int_t *cntelt_vtcsA_lvl; /* size of column/row of A for each vertex at the - current level */ - - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ - int_t no_expand; /* Number of memory expansions */ - int_t no_expand_pr; /* Number of memory expansions of the pruned structures */ - int_t no_expcp; /* Number of memory expansions due to the right looking - overestimation approach */ -} Llu_symbfact_t; - -/* Local information on vertices distribution */ -typedef struct { - int_t maxSzBlk; /* Max no of vertices in a block */ - int_t maxNvtcsNds_loc; /* Max number of vertices of a node distributed on one - processor. The maximum is computed among all the nodes - of the sep arator tree and among all the processors */ - int_t maxNeltsVtx; /* Max number of elements of a vertex, - that is condisering that the matrix is - dense */ - int_t nblks_loc; /* Number of local blocks */ - int_t *begEndBlks_loc; /* Begin and end vertex of each local block. - Array of size 2 * nblks_loc */ - int_t curblk_loc; /* Index of current block in the level under computation */ - int_t nvtcs_loc; /* Number of local vertices distributed on a processor */ - int_t nvtcsLvl_loc; /* Number of local vertices for current - level under computation */ - int filledSep; /* determines if curent or all separators are filled */ - int_t nnz_asup_loc; /* Number of nonzeros in asup not yet consumed. Used during - symbolic factorization routine to determine how much - of xusub, usub is still used to store the input matrix AS */ - int_t nnz_ainf_loc; /* Number of nonzeros in ainf. Similar to nnz_asup_loc. */ - int_t xusub_nextLvl; /* Pointer to usub of the next level */ - int_t xlsub_nextLvl; /* Pointer to lsub of the next level */ - int_t fstVtx_nextLvl; /* First vertex of the next level */ -} vtcsInfo_symbfact_t; - -/* Structure used for redistributing A for the symbolic factorization algorithm */ -typedef struct { - int_t *x_ainf; /* pointers to columns of Ainf */ - int_t *ind_ainf; /* column indices of Ainf */ - int_t *x_asup; /* pointers to rows of Asup */ - int_t *ind_asup; /* row indices of Asup */ -} matrix_symbfact_t; - -typedef struct { - int_t *rcv_interLvl; /* from which processors iam receives data */ - int_t *snd_interLvl; /* to which processors iam sends data */ - int_t *snd_interSz; /* size of data to be send */ - int_t *snd_LinterSz; /* size of data in L part to be send */ - int_t *snd_vtxinter; /* first vertex from where to send data */ - - /* inter level data structures */ - int_t *snd_intraLvl; /* to which processors iam sends data */ - int_t snd_intraSz; /* size of data to send */ - int_t snd_LintraSz; /* size of data to send */ - int_t *rcv_intraLvl; /* from which processors iam receives data */ - int_t *rcv_buf; /* buffer to receive data */ - int_t rcv_bufSz; /* size of the buffer to receive data */ - int_t *snd_buf; /* buffer to send data */ - int_t snd_bufSz; /* size of the buffer to send data */ - int_t *ptr_rcvBuf; /* pointer to rcv_buf, the buffer to receive data */ -} comm_symbfact_t; - -/* relaxation parameters used in the algorithms - for future release */ -/* statistics collected during parallel symbolic factorization */ -typedef struct { - int_t fill_par; /* Estimation of fill. It corresponds to sp_ienv_dist(6) */ - float relax_seps; /* relaxation parameter -not used in this version */ - float relax_curSep; /* relaxation parameter -not used in this version */ - float relax_gen; /* relaxation parameter -not used in this version */ - - /* number of operations performed during parallel symbolic factorization */ - float nops; - - /* no of dense current separators per proc */ - int_t nDnsCurSep; - /* no of dense separators up per proc */ - int_t nDnsUpSeps; - - float no_shmSnd; /* Number of auxiliary messages for send data */ - float no_msgsSnd; /* Number of messages sending data */ - int_t maxsz_msgSnd; /* Max size of messages sending data */ - float sz_msgsSnd; /* Average size of messages sending data */ - float no_shmRcvd; /* Number of auxiliary messages for rcvd data */ - float no_msgsRcvd; /* Number of messages receiving data */ - int_t maxsz_msgRcvd;/* Max size of messages receiving data */ - float sz_msgsRcvd; /* Average size of messages receiving data */ - float no_msgsCol; /* Number of messages sent for estimating size - of rows/columns, setup information - interLvl_symbfact, */ - int_t maxsz_msgCol; /* Average size of messages counted in - no_msgsCol */ - float sz_msgsCol; /* Max size of messages counted in no_msgsCol */ - - /* statistics on fill-in */ - float fill_pelt[6]; - /* - 0 - average fill per elt added during right-looking factorization - 1 - max fill per elt added during right-looking factorization - 2 - number vertices modified during right-looking factorization - 3 - average fill per elt - 4 - max fill per elt - 5 - number vertices computed in upper levels of separator tree - */ - - /* Memory usage */ - int_t estimLSz; /* size of lsub due to right looking overestimation */ - int_t estimUSz; /* size of usub due to right looking overestimation */ - int_t maxSzLPr; /* maximum size of pruned L */ - int_t maxSzUPr; /* maximum size of pruned U */ - int_t maxSzBuf; /* maximum size of the send and receive buffers */ - int_t szDnsSep; /* size of memory used when there are dense separators */ - float allocMem; /* size of the total memory allocated (in bytes) */ -} psymbfact_stat_t; - -/* MACROS */ - -/* - Macros for comptuting the owner of a vertex and the local index - corresponding to a vertex -*/ -#define OWNER(x) ((x) / maxNvtcsPProc) -#define LOCAL_IND(x) ((x) % maxNvtcsPProc) - -/* Macros for computing the available memory in lsub, usub */ -#define MEM_LSUB(Llu, VInfo) (Llu->szLsub - VInfo->nnz_ainf_loc) -#define MEM_USUB(Llu, VInfo) (Llu->szUsub - VInfo->nnz_asup_loc) - -#define tag_interLvl 2 -#define tag_interLvl_LData 0 -#define tag_interLvl_UData 1 -#define tag_intraLvl_szMsg 1000 -#define tag_intraLvl_LData 1001 -#define tag_intraLvl_UData 1002 -/* tag_intraLvl has to be the last tag number */ -#define tag_intraLvl 1003 - -/* - * Index of diagonal element, no of elements preceding each column/row - * of L/U send to another processor - */ -#define DIAG_IND 0 -#define NELTS_IND 1 -#define RCVD_IND 2 - -#define SUCCES_RET 0 /* successful return from a routine */ -#define ERROR_RET 1 /* error return code from a routine */ -#define FILLED_SEP 2 /* the current separator is dense */ -#define FILLED_SEPS 3 /* all the separators situated on the path from the current - separator to the root separator are dense */ - -/* Code for the type of the memory to expand */ -#define USUB_PR 0 -#define LSUB_PR 1 -#define USUB 0 -#define LSUB 1 - -/* - * Code for the type of computation - right looking (RL_SYMB); left - * looking (LL_SYMB); symbolic factorization of an independent domain - * (DOMAIN_SYMB); current separator is dense (DNS_CURSEP); all the - * separators from the current one to the root of the tree are dense - * (DNS_UPSEPS). - */ -#define RL_SYMB 0 -#define DOMAIN_SYMB 1 -#define LL_SYMB 2 -#define DNS_UPSEPS 3 -#define DNS_CURSEP 4 - - -#endif /* __SUPERLU_DIST_PSYMBFACT */ - - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,522 +0,0 @@ - -/* - * -- Distributed symbolic factorization auxialiary routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 - * INRIA France - January 2004 - * Laura Grigori - * - * November 1, 2007 - */ - -#include "superlu_ddefs.h" -#include "psymbfact.h" - -static void -copy_mem_int(int_t howmany, int_t* old, int_t* new) -{ - register int_t i; - for (i = 0; i < howmany; i++) new[i] = old[i]; -} - - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -/************************************************************************/ -static int_t *expand -/************************************************************************/ -( - int_t prev_len, /* length used from previous call */ - int_t min_new_len, /* minimum new length to allocate */ - int_t *prev_mem, /* pointer to the previous memory */ - int_t *p_new_len, /* length of the new memory allocated */ - int_t len_tcopy_fbeg, /* size of the memory to be copied to new store - starting from the beginning of the memory */ - int_t len_tcopy_fend, /* size of the memory to be copied to new store, - starting from the end of the memory */ - psymbfact_stat_t *PS - ) -{ - float exp = 2.0; - float alpha; - int_t *new_mem; - int_t new_len, tries, lword, extra, bytes_to_copy; - - alpha = exp; - lword = sizeof(int_t); - - new_len = alpha * prev_len; - if (min_new_len > 0 && new_len < min_new_len) - new_len = min_new_len; - - new_mem = (void *) SUPERLU_MALLOC(new_len * lword); - PS->allocMem += new_len * lword; - - if (new_mem) { - if (len_tcopy_fbeg != 0) - copy_mem_int(len_tcopy_fbeg, prev_mem, new_mem); - if (len_tcopy_fend != 0) - copy_mem_int(len_tcopy_fend, &(prev_mem[prev_len-len_tcopy_fend]), - &(new_mem[new_len-len_tcopy_fend])); - } - *p_new_len = new_len; - return new_mem; - -} /* EXPAND */ - - -/* - * Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -/************************************************************************/ -int_t psymbfact_LUXpandMem -/************************************************************************/ -( - int_t iam, - int_t n, /* total number of columns */ - int_t vtxXp, /* current vertex */ - int_t next, /* number of elements currently in the factors */ - int_t min_new_len, /* minimum new length to allocate */ - int_t mem_type, /* which type of memory to expand */ - int_t rout_type, /* during which type of factorization */ - int_t free_prev_mem, /* =1 if prev_mem has to be freed */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* modified - global LU data structures */ - vtcsInfo_symbfact_t *VInfo, - psymbfact_stat_t *PS - ) -{ - int_t *new_mem, *prev_mem, *xsub; - /* size of the memory to be copied to new store starting from the - beginning/end of the memory */ - int_t xsub_nextLvl; - int_t exp, prev_xsub_nextLvl, vtxXp_lid; - int_t *globToLoc, maxNvtcsPProc, nvtcs_loc; - int_t fstVtx_nextLvl, fstVtx_nextLvl_lid, vtx_lid, i, j; - int_t len_tcopy_fbeg, len_tcopy_fend, new_len, prev_len; - - exp = 2; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = VInfo->nvtcs_loc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx_nextLvl = VInfo->fstVtx_nextLvl; - vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] ); - len_tcopy_fbeg = next; - if (fstVtx_nextLvl == n) - fstVtx_nextLvl_lid = nvtcs_loc; - else - fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] ); - - if ( mem_type == LSUB ) { - prev_mem = Llu_symbfact->lsub; - prev_len = Llu_symbfact->szLsub; - xsub = Llu_symbfact->xlsub; - if (rout_type == DOMAIN_SYMB) - prev_xsub_nextLvl = xsub[vtxXp_lid+1]; - else - prev_xsub_nextLvl = VInfo->xlsub_nextLvl; - } else if ( mem_type == USUB ) { - prev_mem = Llu_symbfact->usub; - prev_len = Llu_symbfact->szUsub; - xsub = Llu_symbfact->xusub; - if (rout_type == DOMAIN_SYMB) - prev_xsub_nextLvl = xsub[vtxXp_lid+1]; - else - prev_xsub_nextLvl = VInfo->xusub_nextLvl; - } - - len_tcopy_fend = prev_len - prev_xsub_nextLvl; - if (rout_type == DNS_UPSEPS || rout_type == DNS_CURSEP) { - fstVtx_nextLvl = n; - fstVtx_nextLvl_lid = nvtcs_loc; - len_tcopy_fend = 0; - } -#ifdef TEST_SYMB - printf ("Pe[%d] LUXpand mem_t %d vtxXp %d\n", - iam, mem_type, vtxXp); -#endif - new_mem = expand (prev_len, min_new_len, prev_mem, - &new_len, len_tcopy_fbeg, len_tcopy_fend, PS); - if ( !new_mem ) { - fprintf(stderr, "Pe[%d] Can't exp MemType %d: prv_len %d min_new %d new_l %d\n", - iam, mem_type, prev_len, min_new_len, new_len); - return ERROR_RET; - } - - xsub_nextLvl = new_len - len_tcopy_fend; - - /* reset xsub information pointing to A data */ - if (fstVtx_nextLvl != n || rout_type == DOMAIN_SYMB) { - if (rout_type == DOMAIN_SYMB) - vtx_lid = vtxXp_lid + 1; - else { - vtx_lid = fstVtx_nextLvl_lid +1; - } - i = xsub_nextLvl + xsub[vtx_lid] - prev_xsub_nextLvl; - for (; vtx_lid < nvtcs_loc; vtx_lid ++) { - j = xsub[vtx_lid+1] - xsub[vtx_lid]; - xsub[vtx_lid] = i; - i += j; - } - xsub[vtx_lid] = i; - } - - if (free_prev_mem) { - SUPERLU_FREE (prev_mem); - PS->allocMem -= 0; - } - - if ( mem_type == LSUB ) { - Llu_symbfact->lsub = new_mem; - Llu_symbfact->szLsub = new_len; - VInfo->xlsub_nextLvl = xsub_nextLvl; - } else if ( mem_type == USUB ) { - Llu_symbfact->usub = new_mem; - Llu_symbfact->szUsub = new_len; - VInfo->xusub_nextLvl = xsub_nextLvl; - } - - Llu_symbfact->no_expand ++; - return SUCCES_RET; -} - -/* - * Expand the data structures for L and U during the factorization. - * Return value: SUCCES_RET - successful return - * ERROR_RET - error due to a memory alocation failure - */ -/************************************************************************/ -int_t psymbfact_LUXpand -/************************************************************************/ -( - int_t iam, - int_t n, /* total number of columns */ - int_t fstVtxLvl_loc, /* first vertex in the level to update */ - int_t vtxXp, /* current vertex */ - int_t *p_next, /* number of elements currently in the factors */ - int_t min_new_len, /* minimum new length to allocate */ - int_t mem_type, /* which type of memory to expand */ - int_t rout_type, /* during which type of factorization */ - int_t free_prev_mem, /* =1 if free prev_mem memory */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* modified - global LU data structures */ - vtcsInfo_symbfact_t *VInfo, - psymbfact_stat_t *PS - ) -{ - int mem_error; - int_t *new_mem, *prev_mem, *xsub, sz_prev_mem; - /* size of the memory to be copied to new store starting from the - beginning/end of the memory */ - int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl; - int_t *globToLoc, nvtcs_loc, maxNvtcsPProc; - int_t fstVtx_nextLvl, fstVtx_nextLvl_lid; - int_t i, j, k, vtx_lid, len_texp, nelts, nel; - int_t fstVtxLvl_loc_lid, prev_len, next; - - exp = 2; - next = *p_next; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = VInfo->nvtcs_loc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx_nextLvl = VInfo->fstVtx_nextLvl; - - vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] ); - if (fstVtx_nextLvl == n) - fstVtx_nextLvl_lid = VInfo->nvtcs_loc; - else - fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] ); - if (rout_type == RL_SYMB) - fstVtxLvl_loc_lid = LOCAL_IND( globToLoc[fstVtxLvl_loc] ); - - if ( mem_type == LSUB ) { - xsub = Llu_symbfact->xlsub; - prev_mem = Llu_symbfact->lsub; - prev_xsub_nextLvl = VInfo->xlsub_nextLvl; - sz_prev_mem = Llu_symbfact->szLsub; - } else if ( mem_type == USUB ) { - xsub = Llu_symbfact->xusub; - prev_mem = Llu_symbfact->usub; - prev_xsub_nextLvl = VInfo->xusub_nextLvl; - sz_prev_mem = Llu_symbfact->szUsub; - } -#ifdef TEST_SYMB - printf ("Pe[%d] Expand LU mem_t %d vtxXp %d\n", - iam, mem_type, vtxXp); -#endif - /* Try to expand the size of xsub in the existing memory */ - if (rout_type == RL_SYMB) { - len_texp = 0; - for (vtx_lid = fstVtxLvl_loc_lid; vtx_lid < fstVtx_nextLvl_lid; vtx_lid ++) { - nelts = xsub[vtx_lid+1] - xsub[vtx_lid]; - if (nelts == 0) nelts = 1; - nelts = 2 * nelts; - if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid]) - nelts = Llu_symbfact->cntelt_vtcs[vtx_lid]; - len_texp += nelts; - } -/* len_texp = 2 * (xsub[fstVtx_nextLvl_lid] - xsub[fstVtxLvl_loc_lid]); */ - prev_len = xsub[fstVtxLvl_loc_lid]; - next = prev_len; - } - else { - nelts = xsub[vtxXp_lid+1] - xsub[vtxXp_lid]; - if (nelts == 0) nelts = 1; - len_texp = xsub[fstVtx_nextLvl_lid] - xsub[vtxXp_lid+1] + - 4 * nelts; - prev_len = xsub[vtxXp_lid]; - } - - if (prev_len + len_texp >= prev_xsub_nextLvl) { - /* not enough memory */ - min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) - return (mem_error); - if ( mem_type == LSUB ) - new_mem = Llu_symbfact->lsub; - else if ( mem_type == USUB ) - new_mem = Llu_symbfact->usub; - } - else - new_mem = prev_mem; - - if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp)) - PS->estimLSz = prev_len + len_texp; - if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp)) - PS->estimUSz = prev_len; - - /* expand the space */ - if (rout_type == LL_SYMB) { - i = xsub[vtxXp_lid] + len_texp; - vtx_lid = fstVtx_nextLvl_lid - 1; - for (; vtx_lid > vtxXp_lid; vtx_lid --) { - j = xsub[vtx_lid]; - nel = 0; - while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) { - nel ++; j ++; - } - j = xsub[vtx_lid] + nel - 1; - k = i - (xsub[vtx_lid+1] - xsub[vtx_lid]) + nel - 1; - if (k+1 < i) new_mem[k+1] = EMPTY; - while (j >= xsub[vtx_lid]) { - new_mem[k] = prev_mem[j]; k--; j--; - } - k = i; - i -= (xsub[vtx_lid+1] - xsub[vtx_lid]); - xsub[vtx_lid+1] = k; - } - xsub[vtx_lid+1] = i; - k = *p_next; - if (k < xsub[vtx_lid+1]) - new_mem[k] = EMPTY; - } - - if (rout_type == RL_SYMB) { - *p_next -= xsub[vtxXp_lid]; - i = xsub[fstVtxLvl_loc_lid] + len_texp; - vtx_lid = fstVtx_nextLvl_lid - 1; - for (; vtx_lid >= fstVtxLvl_loc_lid; vtx_lid --) { - nelts = 2 * (xsub[vtx_lid+1] - xsub[vtx_lid]); - if (nelts == 0) nelts = 2; - if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid]) - nelts = Llu_symbfact->cntelt_vtcs[vtx_lid]; - j = xsub[vtx_lid]; - nel = 0; - while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) { - nel ++; j ++; - } - j = xsub[vtx_lid] + nel - 1; - k = i - nelts + nel - 1; - if (k+1 < i) new_mem[k+1] = EMPTY; - while (j >= xsub[vtx_lid]) { - new_mem[k] = prev_mem[j]; k--; j--; - } - k = i; - i -= nelts; - xsub[vtx_lid+1] = k; - } - *p_next += xsub[vtxXp_lid]; - } - - if (free_prev_mem && new_mem != prev_mem) - SUPERLU_FREE (prev_mem); - Llu_symbfact->no_expcp ++; - - return SUCCES_RET; -} - -/* - * Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -/************************************************************************/ -int_t psymbfact_LUXpand_RL -/************************************************************************/ -( - int_t iam, - int_t n, /* total number of columns */ - int_t vtxXp, /* current vertex */ - int_t next, /* number of elements currently in the factors */ - int_t len_texp, /* length to expand */ - int_t mem_type, /* which type of memory to expand */ - Pslu_freeable_t *Pslu_freeable, - Llu_symbfact_t *Llu_symbfact, /* modified - global LU data structures */ - vtcsInfo_symbfact_t *VInfo, - psymbfact_stat_t *PS - ) -{ - int_t *new_mem, *prev_mem, *xsub, mem_error, sz_prev_mem; - /* size of the memory to be copied to new store starting from the - beginning/end of the memory */ - int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl; - int_t *globToLoc, nvtcs_loc, maxNvtcsPProc; - int_t fstVtx_nextLvl, fstVtx_nextLvl_lid; - int_t i, j, k, vtx_lid, nel; - int_t fstVtxLvl_loc_lid, prev_len, min_new_len; - -#ifdef TEST_SYMB - printf ("Pe[%d] Expand LU_RL mem_t %d vtxXp %d\n", - iam, mem_type, vtxXp); -#endif - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = VInfo->nvtcs_loc; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - fstVtx_nextLvl = VInfo->fstVtx_nextLvl; - - vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] ); - if (fstVtx_nextLvl == n) - fstVtx_nextLvl_lid = VInfo->nvtcs_loc; - else - fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] ); - - if ( mem_type == LSUB ) { - xsub = Llu_symbfact->xlsub; - prev_mem = Llu_symbfact->lsub; - prev_xsub_nextLvl = VInfo->xlsub_nextLvl; - sz_prev_mem = Llu_symbfact->szLsub; - } else if ( mem_type == USUB ) { - xsub = Llu_symbfact->xusub; - prev_mem = Llu_symbfact->usub; - prev_xsub_nextLvl = VInfo->xusub_nextLvl; - sz_prev_mem = Llu_symbfact->szUsub; - } - else ABORT("Tries to expand nonexisting memory type.\n"); - - /* Try to expand the size of xsub in the existing memory */ - prev_len = xsub[vtxXp_lid]; - - if (prev_len + len_texp >= prev_xsub_nextLvl) { - /* not enough memory */ - min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) - return (mem_error); - if ( mem_type == LSUB ) - new_mem = Llu_symbfact->lsub; - else if ( mem_type == USUB ) - new_mem = Llu_symbfact->usub; - } - else - new_mem = prev_mem; - - /* expand the space */ - if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp)) - PS->estimLSz = prev_len + len_texp; - if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp)) - PS->estimUSz = prev_len; - - i = xsub[vtxXp_lid] + len_texp; - vtx_lid = fstVtx_nextLvl_lid - 1; - for (; vtx_lid > vtxXp_lid; vtx_lid --) { - j = xsub[vtx_lid]; - nel = 0; - while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) { - nel ++; j++; - } - j = xsub[vtx_lid] + nel - 1; - k = i - Llu_symbfact->cntelt_vtcs[vtx_lid] + nel - 1; - if (k+1 < i) - new_mem[k+1] = EMPTY; - while (j >= xsub[vtx_lid]) { - new_mem[k] = prev_mem[j]; - k--; j--; - } - k = i; - i -= Llu_symbfact->cntelt_vtcs[vtx_lid]; - xsub[vtx_lid+1] = k; - } - xsub[vtx_lid+1] = i; - k = next; - if (k < xsub[vtx_lid+1]) - new_mem[k] = EMPTY; - - if (new_mem != prev_mem) - SUPERLU_FREE (prev_mem); - Llu_symbfact->no_expcp ++; - - return SUCCES_RET; -} - -/* - * Expand the data structures for L and U pruned during the factorization. - * Return value: SUCCES_RET - successful return - * ERROR_RET - error when run out of space - */ -/************************************************************************/ -int_t psymbfact_prLUXpand -/************************************************************************/ -( - int_t iam, - int_t min_new_len, /* minimum new length to allocate */ - MemType mem_type, /* which type of memory to expand */ - Llu_symbfact_t *Llu_symbfact, /* modified L/U pruned structures */ - psymbfact_stat_t *PS - ) -{ - int_t *prev_mem, *new_mem; - int_t prev_len, new_len, len_tcopy_fbeg; - - if ( mem_type == LSUB_PR ) { - prev_len = Llu_symbfact->szLsubPr; - prev_mem = Llu_symbfact->lsubPr; - len_tcopy_fbeg = Llu_symbfact->indLsubPr; - } else if ( mem_type == USUB_PR ) { - prev_len = Llu_symbfact->szUsubPr; - prev_mem = Llu_symbfact->usubPr; - len_tcopy_fbeg = Llu_symbfact->indUsubPr; - } else ABORT("Tries to expand nonexisting memory type.\n"); - -#ifdef TEST_SYMB - printf ("Pe[%d] Expand prmem prev_len %d min_new_l %d len_tfbeg %d\n", - iam, prev_len, min_new_len, len_tcopy_fbeg); -#endif - - new_mem = expand (prev_len, min_new_len, prev_mem, - &new_len, len_tcopy_fbeg, 0, PS); - - if ( !new_mem ) { - fprintf(stderr, "Can't expand MemType %d: \n", mem_type); - return (ERROR_RET); - } - - Llu_symbfact->no_expand_pr ++; - if ( mem_type == LSUB_PR ) { - Llu_symbfact->lsubPr = new_mem; - Llu_symbfact->szLsubPr = new_len; - } else if ( mem_type == USUB_PR ) { - Llu_symbfact->usubPr = new_mem; - Llu_symbfact->szUsubPr = new_len; - } else ABORT("Tries to expand nonexisting memory type.\n"); - - SUPERLU_FREE (prev_mem); - - return SUCCES_RET; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pxerbla.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pxerbla.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pxerbla.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pxerbla.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - -void pxerbla(char *srname, gridinfo_t *grid, int_t info) -{ - printf("{%4d,%4d}: On entry to %6s, parameter number %2d had an illegal value\n", - MYROW(grid->iam, grid), MYCOL(grid->iam, grid), srname, info); - -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzdistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzdistribute.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzdistribute.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzdistribute.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1043 +0,0 @@ - -#include "superlu_zdefs.h" - -int_t -zReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, - gridinfo_t *grid, int_t *colptr[], int_t *rowind[], - doublecomplex *a[]) -{ -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. - * - * Arguments - * ========= - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * colptr (output) int* - * - * rowind (output) int* - * - * a (output) doublecomplex* - * - * Return value - * ============ - * - */ - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; - int_t nnz_loc; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - doublecomplex *aij, **aij_send, *nzval, *dtemp; - doublecomplex *nzval_a; - int iam, it, p, procs; - MPI_Request *send_req; - MPI_Status status; - - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter zReDistribute_A()"); -#endif - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - nnzToRecv = intCalloc_dist(2*procs); - nnzToSend = nnzToRecv + procs; - - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate space for storing the triplets after redistribution. */ - if ( k ) { /* count can be zero. */ - if ( !(ia = intMalloc_dist(2*k)) ) - ABORT("Malloc fails for ia[]."); - if ( !(aij = doublecomplexMalloc_dist(k)) ) - ABORT("Malloc fails for aij[]."); - } - ja = ia + k; - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) - ABORT("Malloc fails for ia_send[]."); - if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) - ABORT("Malloc fails for aij_send[]."); - if ( SendCnt ) { /* count can be zero */ - if ( !(index = intMalloc_dist(2*SendCnt)) ) - ABORT("Malloc fails for index[]."); - if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) - ABORT("Malloc fails for nzval[]."); - } - if ( !(ptr_to_send = intCalloc_dist(procs)) ) - ABORT("Malloc fails for ptr_to_send[]."); - if ( maxnnzToRecv ) { /* count can be zero */ - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) - ABORT("Malloc fails for itemp[]."); - if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) - ABORT("Malloc fails for dtemp[]."); - } - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - if ( !(*colptr = intCalloc_dist(n+1)) ) - ABORT("Malloc fails for *colptr[]."); - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - ++(*colptr)[jcol]; /* Count nonzeros in each column */ - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /*assert(jcol 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - if ( SendCnt ) { - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - } - SUPERLU_FREE(ptr_to_send); - if ( maxnnzToRecv ) { - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - } - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. - ------------------------------------------------------------*/ - if ( nnz_loc ) { /* nnz_loc can be zero */ - if ( !(*rowind = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *rowind[]."); - if ( !(*a = doublecomplexMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *a[]."); - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = (*colptr)[0]; - (*colptr)[0] = 0; - for (j = 1; j < n; ++j) { - k += jsize; - jsize = (*colptr)[j]; - (*colptr)[j] = k; - } - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - j = ja[i]; - k = (*colptr)[j]; - (*rowind)[k] = ia[i]; - (*a)[k] = aij[i]; - ++(*colptr)[j]; - } - - /* Reset the column pointers to the beginning of each column */ - for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; - (*colptr)[0] = 0; - - if ( nnz_loc ) { - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - } - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit zReDistribute_A()"); -#endif - -} /* zReDistribute_A */ - -int_t -pzdistribute(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, - gridinfo_t *grid) -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, - len, len1, nsupc; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - doublecomplex *a; - int_t *asub, *xa; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ - doublecomplex zero = {0.0, 0.0}; - int_t ldaspa; /* LDA of SPA */ - int_t mem_use = 0, iword, dword; - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif -#if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; -#endif - - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = (NRformat_loc *) A->Store; - -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - dword = sizeof(doublecomplex); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzdistribute()"); -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - zReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, - grid, &xa, &asub, &a); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf("--------\n" - ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); -#endif - - if ( fact == SamePattern_SameRowPerm ) { - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We can propagate the new values of A into the existing - L and U data structures. */ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; -#if ( PRNTlevel>=1 ) - mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); -#endif - - } else { - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for ToRecv[]."); - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) - ABORT("Malloc fails for index[]."); -#if ( PRNTlevel>=1 ) - mem_use += k*sizeof(int_t*) + (j + nsupers)*iword; -#endif - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = intCalloc_dist(k)) ) - ABORT("Malloc fails for ToSendD[]."); - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. */ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); -#if ( PRNTlevel>=1 ) - mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword; -#endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); -#endif -#if ( PRNTlevel>=1 ) - mem_use -= 2*k * iword; -#endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); - /* ------------------------------------------------ */ -#if ( PRNTlevel>=1 ) - mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword; -#endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ -#if ( PRNTlevel>=1 ) - mem_use += 4*k*sizeof(int_t*) + 2*len*iword; -#endif - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } /* for j ... */ - - jbrow = PROW( jb, grid ); - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if (!(Lnzval_bc_ptr[ljb] = - doublecomplexMalloc_dist(len*nsupc))) { - fprintf(stderr, "col block %d ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb] = NULL; - Lnzval_bc_ptr[ljb] = NULL; - } /* if nrbl ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - - } /* for jb ... */ - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); -#endif - - } /* else fact != SamePattern_SameRowPerm */ - - if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ - SUPERLU_FREE(asub); - SUPERLU_FREE(a); - } - SUPERLU_FREE(xa); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pzdistribute()"); -#endif - - return (mem_use); -} /* PZDISTRIBUTE */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsequ.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,228 +0,0 @@ - -/* - * File name: pzgsequ.c - * History: Modified from LAPACK routine ZGEEQU - */ -#include -#include "superlu_zdefs.h" - -void -pzgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info, gridinfo_t *grid) -{ -/* - Purpose - ======= - - PZGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= M: the i-th row of A is exactly zero - > M: the (i-M)-th column of A is exactly zero - - GRID (input) gridinof_t* - The 2D process mesh. - ===================================================================== -*/ - - /* Local variables */ - NRformat_loc *Astore; - doublecomplex *Aval; - int i, j, irow, jcol, m_loc; - double rcmin, rcmax; - double bignum, smlnum; - extern double dlamch_(char *); - double tempmax, tempmin; - double *loc_max; - int *r_sizes, *displs; - double *loc_r; - int_t procs; - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NR_loc || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("pzgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - m_loc = Astore->m_loc; - - /* Get machine constants. */ - smlnum = dlamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[j]) ); - ++irow; - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - - /* Get the global MAX and MIN for R */ - tempmax = rcmax; - tempmin = rcmin; - MPI_Allreduce( &tempmax, &rcmax, - 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempmin, &rcmin, - 1, MPI_DOUBLE, MPI_MIN, grid->comm); - - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - c[jcol] = SUPERLU_MAX( c[jcol], z_abs1(&Aval[j]) * r[irow] ); - } - ++irow; - } - - /* Find the global maximum for c[j] */ - if ( !(loc_max = doubleMalloc_dist(A->ncol))) - ABORT("Malloc fails for loc_max[]."); - for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; - MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); - SUPERLU_FREE(loc_max); - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* gather R from each process to get the global R. */ - - procs = grid->nprow * grid->npcol; - if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for r_sizes[]."); - displs = r_sizes + procs; - if ( !(loc_r = doubleMalloc_dist(m_loc))) - ABORT("Malloc fails for loc_r[]."); - j = Astore->fst_row; - for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; - - /* First gather the size of each piece. */ - MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); - - /* Set up the displacements for allgatherv */ - displs[0] = 0; - for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; - - /* Now gather the actual data */ - MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, - MPI_DOUBLE, grid->comm); - - SUPERLU_FREE(r_sizes); - SUPERLU_FREE(loc_r); - - return; - -} /* pzgsequ */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,304 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_zdefs.h" - - -static void zcreate_msr_matrix(SuperMatrix *, int_t [], int_t, - doublecomplex **, int_t **); -static void zPrintMSRmatrix(int, doublecomplex [], int_t [], gridinfo_t *); - - -int pzgsmv_AXglobal_setup -( - SuperMatrix *A, /* Matrix A permuted by columns (input). - The type of A can be: - Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */ - Glu_persist_t *Glu_persist, /* input */ - gridinfo_t *grid, /* input */ - int_t *m, /* output */ - int_t *update[], /* output */ - doublecomplex *val[], /* output */ - int_t *bindx[], /* output */ - int_t *mv_sup_to_proc /* output */ - ) -{ - int n; - int input_option; - int N_update; /* Number of variables updated on this process (output) */ - int iam = grid->iam; - int nprocs = grid->nprow * grid->npcol; - int_t *xsup = Glu_persist->xsup; - int_t *supno = Glu_persist->supno; - int_t nsupers; - int i, nsup, p, t1, t2, t3; - - - /* Initialize the list of global indices. - * NOTE: the list of global indices must be in ascending order. - */ - n = A->nrow; - input_option = SUPER_LINEAR; - nsupers = supno[n-1] + 1; - -#if ( DEBUGlevel>=2 ) - if ( !iam ) { - PrintInt10("xsup", supno[n-1]+1, xsup); - PrintInt10("supno", n, supno); - } -#endif - - if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on - individual rows. */ - /* Figure out mv_sup_to_proc[] on all processes. */ - for (p = 0; p < nprocs; ++p) { - t1 = n / nprocs; /* Number of rows */ - t2 = n - t1 * nprocs; /* left-over, which will be assigned - to the first t2 processes. */ - if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ - else { /* First t2 processes will get one more row. */ - ++t1; /* Number of rows. */ - t2 = p * t1; /* Starting row. */ - } - /* Make sure the starting and ending rows are at the - supernode boundaries. */ - t3 = t2 + t1; /* Ending row. */ - nsup = supno[t2]; - if ( t2 > xsup[nsup] ) { /* Round up the starting row. */ - t1 -= xsup[nsup+1] - t2; - t2 = xsup[nsup+1]; - } - nsup = supno[t3]; - if ( t3 > xsup[nsup] ) /* Round up the ending row. */ - t1 += xsup[nsup+1] - t3; - t3 = t2 + t1 - 1; - if ( t1 ) { - for (i = supno[t2]; i <= supno[t3]; ++i) { - mv_sup_to_proc[i] = p; -#if ( DEBUGlevel>=3 ) - if ( mv_sup_to_proc[i] == p-1 ) { - fprintf(stderr, - "mv_sup_to_proc conflicts at supno %d\n", i); - exit(-1); - } -#endif - } - } - - if ( iam == p ) { - N_update = t1; - if ( N_update ) { - if ( !(*update = intMalloc_dist(N_update)) ) - ABORT("Malloc fails for update[]"); - } - for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; -#if ( DEBUGlevel>=3 ) - printf("(%2d) N_update = %4d\t" - "supers %4d to %4d\trows %4d to %4d\n", - iam, N_update, supno[t2], supno[t3], t2, t3); -#endif - } - } /* for p ... */ - } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on - individual supernodes. */ - /* This may cause bad load balance, because the blocks are usually - small in the beginning and large toward the end. */ - t1 = nsupers / nprocs; - t2 = nsupers - t1 * nprocs; /* left-over */ - if ( iam >= t2 ) t2 += (iam * t1); - else { - ++t1; /* Number of blocks. */ - t2 = iam * t1; /* Starting block. */ - } - N_update = xsup[t2+t1] - xsup[t2]; - if ( !(*update = intMalloc_dist(N_update)) ) - ABORT("Malloc fails for update[]"); - for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; - } - - - /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */ - zcreate_msr_matrix(A, *update, N_update, val, bindx); - -#if ( DEBUGlevel>=2 ) - PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); - zPrintMSRmatrix(N_update, *val, *bindx, grid); -#endif - - *m = N_update; - return 0; -} /* PZGSMV_AXglobal_SETUP */ - - -/* Create the distributed modified sparse row (MSR) matrix: bindx/val. - * For a submatrix of size m-by-n, the MSR arrays are as follows: - * bindx[0] = m + 1 - * bindx[0..m] = pointer to start of each row - * bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k, - * where, ks = bindx[k], ke = bindx[k+1]-1 - * val[k] = A(k,k), k < m, diagonal elements - * val[m] = not used - * val[ki] = A(k, bindx[ki]), where ks <= ki <= ke - * Both arrays are of length nnz + 1. - */ -static void zcreate_msr_matrix -( - SuperMatrix *A, /* Matrix A permuted by columns (input). - The type of A can be: - Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */ - int_t update[], /* input (local) */ - int_t N_update, /* input (local) */ - doublecomplex **val, /* output */ - int_t **bindx /* output */ -) -{ - int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; - NCPformat *Astore; - doublecomplex *nzval; - int_t *rowcnt; - doublecomplex zero = {0.0, 0.0}; - - if ( !N_update ) return; - - n = A->ncol; - Astore = A->Store; - nzval = Astore->nzval; - - /* One pass of original matrix A to count nonzeros of each row. */ - if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) - ABORT("Malloc fails for rowcnt[]"); - lo = update[0]; - hi = update[N_update-1]; - nnz_local = 0; - nnz_diag = 0; - for (j = 0; j < n; ++j) { - for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { - irow = Astore->rowind[i]; - if ( irow >= lo && irow <= hi ) { - if ( irow != j ) /* Exclude diagonal */ - ++rowcnt[irow - lo]; - else ++nnz_diag; /* Count nonzero diagonal entries */ - ++nnz_local; - } - } - } - - /* Add room for the logical diagonal zeros which are not counted - in nnz_local. */ - nnz_local += (N_update - nnz_diag); - - /* Allocate storage for bindx[] and val[]. */ - if ( !(*val = (doublecomplex *) doublecomplexMalloc_dist(nnz_local+1)) ) - ABORT("Malloc fails for val[]"); - for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ - if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) - ABORT("Malloc fails for bindx[]"); - - /* Set up row pointers. */ - (*bindx)[0] = N_update + 1; - for (j = 1; j <= N_update; ++j) { - (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; - rowcnt[j-1] = (*bindx)[j-1]; - } - - /* One pass of original matrix A to fill in matrix entries. */ - for (j = 0; j < n; ++j) { - for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { - irow = Astore->rowind[i]; - if ( irow >= lo && irow <= hi ) { - if ( irow == j ) /* Diagonal */ - (*val)[irow - lo] = nzval[i]; - else { - irow -= lo; - k = rowcnt[irow]; - (*bindx)[k] = j; - (*val)[k] = nzval[i]; - ++rowcnt[irow]; - } - } - } - } - - SUPERLU_FREE(rowcnt); -} - -/* - * Performs sparse matrix-vector multiplication. - * - val/bindx stores the distributed MSR matrix A - * - X is global - * - ax product is distributed the same way as A - */ -int -pzgsmv_AXglobal(int_t m, int_t update[], doublecomplex val[], int_t bindx[], - doublecomplex X[], doublecomplex ax[]) -{ - int_t i, j, k; - doublecomplex zero = {0.0, 0.0}; - doublecomplex temp; - - if ( m <= 0 ) return; /* number of rows (local) */ - - for (i = 0; i < m; ++i) { - ax[i] = zero; - - for (k = bindx[i]; k < bindx[i+1]; ++k) { - j = bindx[k]; /* column index */ - zz_mult(&temp, &val[k], &X[j]); - z_add(&ax[i], &ax[i], &temp); - } - zz_mult(&temp, &val[i], &X[update[i]]); /* diagonal */ - z_add(&ax[i], &ax[i], &temp); - } -} /* PZGSMV_AXglobal */ - -/* - * Performs sparse matrix-vector multiplication. - * - val/bindx stores the distributed MSR matrix A - * - X is global - * - ax product is distributed the same way as A - */ -int -pzgsmv_AXglobal_abs(int_t m, int_t update[], doublecomplex val[], int_t bindx[], - doublecomplex X[], double ax[]) -{ - int_t i, j, k; - - if ( m <= 0 ) return; /* number of rows (local) */ - - for (i = 0; i < m; ++i) { - ax[i] = 0.0; - for (k = bindx[i]; k < bindx[i+1]; ++k) { - j = bindx[k]; /* column index */ - ax[i] += z_abs1(&val[k]) * z_abs1(&X[j]); - } - ax[i] += z_abs1(&val[i]) * z_abs1(&X[update[i]]); /* diagonal */ - } -} /* PZGSMV_AXglobal_ABS */ - -/* - * Print the local MSR matrix - */ -static void zPrintMSRmatrix -( - int m, /* Number of rows of the submatrix. */ - doublecomplex val[], - int_t bindx[], - gridinfo_t *grid -) -{ - int iam, nnzp1; - - if ( !m ) return; - - iam = grid->iam; - nnzp1 = bindx[m]; - printf("(%2d) MSR submatrix has %d rows -->\n", iam, m); - PrintDoublecomplex("val", nnzp1, val); - PrintInt10("bindx", nnzp1, bindx); -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,372 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_zdefs.h" - -void pzgsmv_init -( - SuperMatrix *A, /* Matrix A permuted by columns (input/output). - The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. */ - int_t *row_to_proc, /* Input. Mapping between rows and processes. */ - gridinfo_t *grid, /* Input */ - pzgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ - ) -{ - NRformat_loc *Astore; - int iam, p, procs; - int *SendCounts, *RecvCounts; - int_t i, j, k, l, m, m_loc, n, fst_row, jcol; - int_t TotalIndSend, TotalValSend; - int_t *colind, *rowptr; - int_t *ind_tosend = NULL, *ind_torecv = NULL; - int_t *ptr_ind_tosend, *ptr_ind_torecv; - int_t *extern_start, *spa, *itemp; - doublecomplex *nzval, *val_tosend = NULL, *val_torecv = NULL, t; - MPI_Request *send_req, *recv_req; - MPI_Status status; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzgsmv_init()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - m = A->nrow; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - colind = Astore->colind; - rowptr = Astore->rowptr; - nzval = Astore->nzval; - if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) - ABORT("Malloc fails for SendCounts[]"); - /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ - RecvCounts = SendCounts + procs; - if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) - ABORT("Malloc fails for ptr_ind_tosend[]"); - ptr_ind_torecv = ptr_ind_tosend + procs + 1; - if ( !(extern_start = intMalloc_dist(m_loc)) ) - ABORT("Malloc fails for extern_start[]"); - for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. - THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. - SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE - LOCAL PART OF X. - THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ - ABORT("Malloc fails for spa[]"); - for (p = 0; p < procs; ++p) SendCounts[p] = 0; - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = extern_start[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ - jcol = colind[j]; - p = row_to_proc[jcol]; - if ( p != iam ) { /* External */ - if ( spa[jcol] == 0 ) { /* First time see this index */ - ++SendCounts[p]; - spa[jcol] = 1; - } - } else { /* Swap to beginning the part of A corresponding - to the local part of X */ - l = colind[k]; - t = nzval[k]; - colind[k] = jcol; - nzval[k] = nzval[j]; - colind[j] = l; - nzval[j] = t; - ++k; - } - } - extern_start[i] = k; - } - - /* ------------------------------------------------------------ - LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. - THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - /* Build pointers to ind_tosend[]. */ - ptr_ind_tosend[0] = 0; - for (p = 0, TotalIndSend = 0; p < procs; ++p) { - TotalIndSend += SendCounts[p]; /* Total to send. */ - ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; - } -#if 0 - ptr_ind_tosend[iam] = 0; /* Local part of X */ -#endif - if ( TotalIndSend ) { - if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) - ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ - } - - /* Build SPA to aid global to local translation. */ - for (i = 0; i < n; ++i) spa[i] = EMPTY; - for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - if ( spa[jcol] == EMPTY ) { /* First time see this index */ - p = row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - /*assert(jcol>=fst_row);*/ - spa[jcol] = jcol - fst_row; /* Relative position in local X */ - } else { /* External */ - ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ - spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ - ++ptr_ind_tosend[p]; - } - } - } - } - - /* ------------------------------------------------------------ - TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. - THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - colind[j] = spa[jcol]; - } - } - - /* ------------------------------------------------------------ - COMMUNICATE THE EXTERNAL INDICES OF X. - ------------------------------------------------------------*/ - MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, - grid->comm); - - /* Build pointers to ind_torecv[]. */ - ptr_ind_torecv[0] = 0; - for (p = 0, TotalValSend = 0; p < procs; ++p) { - TotalValSend += RecvCounts[p]; /* Total to receive. */ - ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; - } - if ( TotalValSend ) { - if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) - ABORT("Malloc fails for ind_torecv[]"); - } - - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) - ABORT("Malloc fails for recv_req[]."); - recv_req = send_req + procs; - for (p = 0; p < procs; ++p) { - ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ - if ( SendCounts[p] ) { - MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], - mpi_int_t, p, iam, grid->comm, &send_req[p]); - } - if ( RecvCounts[p] ) { - MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], - mpi_int_t, p, p, grid->comm, &recv_req[p]); - } - } - for (p = 0; p < procs; ++p) { - if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Allocate storage for the X values to to transferred. */ - if ( TotalIndSend && - !(val_torecv = doublecomplexMalloc_dist(TotalIndSend)) ) - ABORT("Malloc fails for val_torecv[]."); - if ( TotalValSend && - !(val_tosend = doublecomplexMalloc_dist(TotalValSend)) ) - ABORT("Malloc fails for val_tosend[]."); - - gsmv_comm->extern_start = extern_start; - gsmv_comm->ind_tosend = ind_tosend; - gsmv_comm->ind_torecv = ind_torecv; - gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; - gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; - gsmv_comm->SendCounts = SendCounts; - gsmv_comm->RecvCounts = RecvCounts; - gsmv_comm->val_tosend = val_tosend; - gsmv_comm->val_torecv = val_torecv; - gsmv_comm->TotalIndSend = TotalIndSend; - gsmv_comm->TotalValSend = TotalValSend; - - SUPERLU_FREE(spa); - SUPERLU_FREE(send_req); - -#if ( DEBUGlevel>=2 ) - PrintInt10("pzgsmv_init::rowptr", m_loc+1, rowptr); - PrintInt10("pzgsmv_init::extern_start", m_loc, extern_start); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsmv_init()"); -#endif - -} /* PZGSMV_INIT */ - - -/* - * Performs sparse matrix-vector multiplication. - */ -void -pzgsmv -( - int_t abs, /* Input. Do abs(A)*abs(x). */ - SuperMatrix *A_internal, /* Input. Matrix A permuted by columns. - The column indices are translated into - the relative positions in the gathered x-vector. - The type of A can be: - Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */ - gridinfo_t *grid, /* Input */ - pzgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */ - doublecomplex x[], /* Input. The distributed source vector */ - doublecomplex ax[] /* Output. The distributed destination vector */ -) -{ - NRformat_loc *Astore; - int iam, procs; - int_t i, j, p, m, m_loc, n, fst_row, jcol; - int_t *colind, *rowptr; - int *SendCounts, *RecvCounts; - int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv; - int_t *extern_start, TotalValSend; - doublecomplex *nzval, *val_tosend, *val_torecv; - doublecomplex zero = {0.0, 0.0}, temp; - double *ax_abs = (double *) ax; - MPI_Request *send_req, *recv_req; - MPI_Status status; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzgsmv()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A_internal->Store; - m = A_internal->nrow; - n = A_internal->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - colind = Astore->colind; - rowptr = Astore->rowptr; - nzval = (doublecomplex *) Astore->nzval; - extern_start = gsmv_comm->extern_start; - ind_torecv = gsmv_comm->ind_torecv; - ptr_ind_tosend = gsmv_comm->ptr_ind_tosend; - ptr_ind_torecv = gsmv_comm->ptr_ind_torecv; - SendCounts = gsmv_comm->SendCounts; - RecvCounts = gsmv_comm->RecvCounts; - val_tosend = (doublecomplex *) gsmv_comm->val_tosend; - val_torecv = (doublecomplex *) gsmv_comm->val_torecv; - TotalValSend = gsmv_comm->TotalValSend; - - /* ------------------------------------------------------------ - COPY THE X VALUES INTO THE SEND BUFFER. - ------------------------------------------------------------*/ - for (i = 0; i < TotalValSend; ++i) { - j = ind_torecv[i] - fst_row; /* Relative index in x[] */ - val_tosend[i] = x[j]; - } - - /* ------------------------------------------------------------ - COMMUNICATE THE X VALUES. - ------------------------------------------------------------*/ - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) - ABORT("Malloc fails for recv_req[]."); - recv_req = send_req + procs; - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) { - MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p], - SuperLU_MPI_DOUBLE_COMPLEX, p, iam, - grid->comm, &send_req[p]); - } - if ( SendCounts[p] ) { - MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p], - SuperLU_MPI_DOUBLE_COMPLEX, p, p, - grid->comm, &recv_req[p]); - } - } - - /* ------------------------------------------------------------ - PERFORM THE ACTUAL MULTIPLICATION. - ------------------------------------------------------------*/ - if ( abs ) { /* Perform abs(A)*abs(x) */ - /* Multiply the local part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - ax_abs[i] = 0.0; - for (j = rowptr[i]; j < extern_start[i]; ++j) { - jcol = colind[j]; - ax_abs[i] += z_abs1(&nzval[j]) * z_abs1(&x[jcol]); - } - } - - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Multiply the external part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - for (j = extern_start[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - ax_abs[i] += z_abs1(&nzval[j]) * z_abs(&val_torecv[jcol]); - } - } - } else { - /* Multiply the local part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - ax[i] = zero; - for (j = rowptr[i]; j < extern_start[i]; ++j) { - jcol = colind[j]; - zz_mult(&temp, &nzval[j], &x[jcol]); - z_add(&ax[i], &ax[i], &temp); - } - } - - for (p = 0; p < procs; ++p) { - if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); - if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); - } - - /* Multiply the external part. */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - for (j = extern_start[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - zz_mult(&temp, &nzval[j], &val_torecv[jcol]); - z_add(&ax[i], &ax[i], &temp); - } - } - } - - SUPERLU_FREE(send_req); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsmv()"); -#endif - -} /* PZGSMV */ - -void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm) -{ - int_t *it; - doublecomplex *dt; - SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); - SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); - SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,445 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_zdefs.h" - -/*-- Function prototypes --*/ -static void gather_1rhs_diag_to_all(int_t, doublecomplex [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], doublecomplex [], doublecomplex []); -static void redist_all_to_diag(int_t, doublecomplex [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t [], doublecomplex []); - -void -pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, - int nrhs, double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pzgsrfs_ABXglobal improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc - * are permutation matrices. The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pzgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input) doublecomplex* (global) - * The N-by-NRHS right-hand side matrix of the possibly equilibrated - * and row permuted system. - * - * NOTE: Currently, B must reside on all processes when calling - * this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * X (input/output) doublecomplex* (global) - * On entry, the solution matrix X, as computed by PZGSTRS. - * On exit, the improved solution matrix X. - * If DiagScale = COL or BOTH, X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * NOTE: Currently, X must reside on all processes when calling - * this routine. - * - * ldx (input) int (global) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - /* - * Data structures used by matrix-vector multiply routine. - */ - int_t N_update; /* Number of variables updated on this process */ - int_t *update; /* vector elements (global index) updated - on this processor. */ - int_t *bindx; - doublecomplex *val; - int_t *mv_sup_to_proc; /* Supernode to process mapping in - matrix-vector multiply. */ - /*-- end data structures for matrix-vector multiply --*/ - doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col, - *x_trs, *dx_trs; - double *rwork; - int_t notran; - int_t count, ii, j, jj, k, knsupc, lk, lwork, - nprow, nsupers, nz, p; - int i, iam, pkk; - int_t *ilsum, *xsup; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* NEW STUFF */ - int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ - int_t *diag_len; /* Length of the X vector on diagonal processes. */ - - /*-- Function prototypes --*/ - extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *, - doublecomplex *, int, SuperLUStat_t *, int *); - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - xerbla_("pzgsrfs_ABXglobal", &i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - /* Initialization. */ - iam = grid->iam; - nprow = grid->nprow; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - notran = 1; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()"); -#endif - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. number of diag processes = %d\n", num_diag_procs); - PrintInt10("diag_procs", num_diag_procs, diag_procs); - PrintInt10("diag_len", num_diag_procs, diag_len); - } -#endif - - if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for mv_sup_to_proc[]"); - - pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, - &val, &bindx, mv_sup_to_proc); - - i = CEILING( nsupers, nprow ); /* Number of local block rows */ - ii = Llu->ldalsum + i * XK_H; - k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); - jj = SUPERLU_MAX( jj, N_update ); - lwork = N_update /* For ax and R */ - + ii /* For dx_trs */ - + ii /* For x_trs */ - + k /* For b */ - + jj; /* for temp */ - if ( !(work = doublecomplexMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = work; - dx_trs = work + N_update; - x_trs = dx_trs + ii; - b = x_trs + ii; - temp = b + k; - if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) ) - ABORT("Malloc fails for rwork[]"); - -#if ( DEBUGlevel>=2 ) - { - doublecomplex *dwork = doublecomplexMalloc_dist(n); - for (i = 0; i < n; ++i) { - if ( i & 1 ) dwork[i].r = 1.; - else dwork[i].r = 2.; - dwork[i].i = 0.; - } - /* Check correctness of matrix-vector multiply. */ - pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); - PrintDouble5("Mult A*x", N_update, ax); - SUPERLU_FREE(dwork); - } -#endif - - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - - /* Set SAFE1 essentially to be the underflow threshold times the - number of additions in each row. */ - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - - /* Copy X into x on the diagonal processes. */ - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - jj = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; - dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */ - } - } - } - /* Copy B into b distributed the same way as matrix-vector product. */ - if ( N_update ) ii = update[0]; - for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); - - /* Compute residual. */ - for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork); - for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]); - - s = 0.0; - for (i = 0; i < N_update; ++i) { - if ( rwork[i] > safe2 ) { - s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]); - } else if ( rwork[i] != 0.0 ) { - s = SUPERLU_MAX(s, (safe1 + z_abs1(&R[i])) / rwork[i]); - } - /* If temp[i] is exactly 0.0 (computed by PxGSMV), then - we know the true residual also must be exactly 0.0. */ - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - redist_all_to_diag(n, R, Glu_persist, Llu, grid, - mv_sup_to_proc, dx_trs); - pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); - - /* Update solution. */ - for (p = 0; p < num_diag_procs; ++p) - if ( iam == diag_procs[p] ) - for (k = p; k < nsupers; k += num_diag_procs) { - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - knsupc = SuperSize( k ); - for (i = 0; i < knsupc; ++i) - z_add(&x_trs[i + ii], &x_trs[i + ii], - &dx_trs[i + ii]); - } - lstres = berr[j]; - ++count; - /* Transfer x_trs (on diagonal processes) into X - (on all processes). */ - gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, - num_diag_procs, diag_procs, diag_len, - X_col, temp); - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - - /* Deallocate storage used by matrix-vector multiplication. */ - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - if ( N_update ) { - SUPERLU_FREE(update); - SUPERLU_FREE(bindx); - SUPERLU_FREE(val); - } - SUPERLU_FREE(mv_sup_to_proc); - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()"); -#endif - -} /* PZGSRFS_ABXGLOBAL */ - - -/* - * r[] is the residual vector distributed the same way as - * matrix-vector product. - */ -static void -redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist, - LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], - doublecomplex work[]) -{ - int_t i, ii, k, lk, lr, nsupers; - int_t *ilsum, *xsup; - int iam, knsupc, psrc, pkk; - MPI_Status status; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - lr = 0; - - for (k = 0; k < nsupers; ++k) { - pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); - psrc = mv_sup_to_proc[k]; - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - if ( iam == psrc ) { - if ( iam != pkk ) { /* Send X component. */ - MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk, - grid->comm ); - } else { /* Local copy. */ - for (i = 0; i < knsupc; ++i) - work[i + ii] = r[i + lr]; - } - lr += knsupc; - } else { - if ( iam == pkk ) { /* Recv X component. */ - MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk, - grid->comm, &status ); - } - } - } -} /* REDIST_ALL_TO_DIAG */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_1rhs_diag_to_all(int_t n, doublecomplex x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - doublecomplex y[], doublecomplex work[]) -{ - int_t i, ii, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; - lwork += knsupc; - } - MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; - lwork += knsupc; - } - } -} /* GATHER_1RHS_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,439 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include -#include "superlu_zdefs.h" - -/*-- Function prototypes --*/ -static void gather_1rhs_diag_to_all(int_t, doublecomplex [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], doublecomplex [], doublecomplex []); -static void redist_all_to_diag(int_t, doublecomplex [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t [], doublecomplex []); - -void -pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, - int nrhs, double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pzgsrfs_ABXglobal improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc - * are permutation matrices. The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pzgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input) doublecomplex* (global) - * The N-by-NRHS right-hand side matrix of the possibly equilibrated - * and row permuted system. - * - * NOTE: Currently, B must reside on all processes when calling - * this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * X (input/output) doublecomplex* (global) - * On entry, the solution matrix X, as computed by PZGSTRS. - * On exit, the improved solution matrix X. - * If DiagScale = COL or BOTH, X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * NOTE: Currently, X must reside on all processes when calling - * this routine. - * - * ldx (input) int (global) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - /* - * Data structures used by matrix-vector multiply routine. - */ - int_t N_update; /* Number of variables updated on this process */ - int_t *update; /* vector elements (global index) updated - on this processor. */ - int_t *bindx; - doublecomplex *val; - int_t *mv_sup_to_proc; /* Supernode to process mapping in - matrix-vector multiply. */ - /*-- end data structures for matrix-vector multiply --*/ - doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col, - *x_trs, *dx_trs; - double *rwork; - int_t notran; - int_t count, ii, j, jj, k, knsupc, lk, lwork, - nprow, nsupers, nz, p; - int i, iam, pkk; - int_t *ilsum, *xsup; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* NEW STUFF */ - int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ - int_t *diag_len; /* Length of the X vector on diagonal processes. */ - - /*-- Function prototypes --*/ - extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *, - doublecomplex *, int, SuperLUStat_t *, int *); - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - xerbla_("pzgsrfs_ABXglobal", &i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - /* Initialization. */ - iam = grid->iam; - nprow = grid->nprow; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - notran = 1; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()"); -#endif - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. number of diag processes = %d\n", num_diag_procs); - PrintInt10("diag_procs", num_diag_procs, diag_procs); - PrintInt10("diag_len", num_diag_procs, diag_len); - } -#endif - - if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for mv_sup_to_proc[]"); - - pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, - &val, &bindx, mv_sup_to_proc); - - i = CEILING( nsupers, nprow ); /* Number of local block rows */ - ii = Llu->ldalsum + i * XK_H; - k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); - jj = SUPERLU_MAX( jj, N_update ); - lwork = N_update /* For ax and R */ - + ii /* For dx_trs */ - + ii /* For x_trs */ - + k /* For b */ - + jj; /* for temp */ - if ( !(work = doublecomplexMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = work; - dx_trs = work + N_update; - x_trs = dx_trs + ii; - b = x_trs + ii; - temp = b + k; - if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) ) - ABORT("Malloc fails for rwork[]"); - -#if ( DEBUGlevel>=2 ) - { - doublecomplex *dwork = doublecomplexMalloc_dist(n); - for (i = 0; i < n; ++i) { - if ( i & 1 ) dwork[i].r = 1.; - else dwork[i].r = 2.; - dwork[i].i = 0.; - } - /* Check correctness of matrix-vector multiply. */ - pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); - PrintDouble5("Mult A*x", N_update, ax); - SUPERLU_FREE(dwork); - } -#endif - - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - - /* Copy X into x on the diagonal processes. */ - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - jj = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; - dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */ - } - } - } - /* Copy B into b distributed the same way as matrix-vector product. */ - if ( N_update ) ii = update[0]; - for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); - - /* Compute residual. */ - for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork); - for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]); - - s = 0.0; - for (i = 0; i < N_update; ++i) { - if ( rwork[i] > safe2 ) - s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]); - else - s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rwork[i]+safe1)); - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - redist_all_to_diag(n, R, Glu_persist, Llu, grid, - mv_sup_to_proc, dx_trs); - pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); - - /* Update solution. */ - for (p = 0; p < num_diag_procs; ++p) - if ( iam == diag_procs[p] ) - for (k = p; k < nsupers; k += num_diag_procs) { - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - knsupc = SuperSize( k ); - for (i = 0; i < knsupc; ++i) - z_add(&x_trs[i + ii], &x_trs[i + ii], - &dx_trs[i + ii]); - } - lstres = berr[j]; - ++count; - /* Transfer x_trs (on diagonal processes) into X - (on all processes). */ - gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, - num_diag_procs, diag_procs, diag_len, - X_col, temp); - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - - /* Deallocate storage used by matrix-vector multiplication. */ - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - if ( N_update ) { - SUPERLU_FREE(update); - SUPERLU_FREE(bindx); - SUPERLU_FREE(val); - } - SUPERLU_FREE(mv_sup_to_proc); - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()"); -#endif - -} /* PZGSRFS_ABXGLOBAL */ - - -/* - * r[] is the residual vector distributed the same way as - * matrix-vector product. - */ -static void -redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist, - LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], - doublecomplex work[]) -{ - int_t i, ii, k, lk, lr, nsupers; - int_t *ilsum, *xsup; - int iam, knsupc, psrc, pkk; - MPI_Status status; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - lr = 0; - - for (k = 0; k < nsupers; ++k) { - pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); - psrc = mv_sup_to_proc[k]; - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - if ( iam == psrc ) { - if ( iam != pkk ) { /* Send X component. */ - MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk, - grid->comm ); - } else { /* Local copy. */ - for (i = 0; i < knsupc; ++i) - work[i + ii] = r[i + lr]; - } - lr += knsupc; - } else { - if ( iam == pkk ) { /* Recv X component. */ - MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk, - grid->comm, &status ); - } - } - } -} /* REDIST_ALL_TO_DIAG */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_1rhs_diag_to_all(int_t n, doublecomplex x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - doublecomplex y[], doublecomplex work[]) -{ - int_t i, ii, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = ilsum[lk] + (lk+1)*XK_H; - for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; - lwork += knsupc; - } - MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; - lwork += knsupc; - } - } -} /* GATHER_1RHS_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,246 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_zdefs.h" - -void -pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, - doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, - SOLVEstruct_t *SOLVEstruct, - double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PZGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_zdefs.h for the definition of 'LUstruct_t'. - * - * ScalePermstruct (input) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input) doublecomplex* (local) - * The m_loc-by-NRHS right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * X (input/output) doublecomplex* (local) - * On entry, the solution matrix Y, as computed by PDGSTRS, of the - * transformed system A1*Y = Pc*Pr*B. where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X. - * On exit, the improved solution matrix Y. - * - * In order to obtain the solution X to the original system, - * Y should be permutated by Pc^T, and premultiplied by diag(C) - * if DiagScale = COL or BOTH. - * This must be done after this routine is called. - * - * ldx (input) int (local) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col; - double *rtemp; - int_t count, i, j, lwork, nz; - int iam; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* Data structures used by matrix-vector multiply routine. */ - pzgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; - NRformat_loc *Astore; - int_t m_loc, fst_row; - - - /* Initialization. */ - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - iam = grid->iam; - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - pxerbla("PZGSRFS", grid, i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgsrfs()"); -#endif - - lwork = 2 * m_loc; /* For ax/R/dx and temp */ - if ( !(work = doublecomplexMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = dx = work; - temp = ax + m_loc; - rtemp = (double *) temp; - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - - /* Set SAFE1 essentially to be the underflow threshold times the - number of additions in each row. */ - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pzgsmv(0, A, grid, gsmv_comm, X_col, ax); - - /* Compute residual, stored in R[]. */ - for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]); - - /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ - pzgsmv(1, A, grid, gsmv_comm, X_col, temp); - /* NOTE: rtemp is aliased to temp */ - for (i = 0; i < m_loc; ++i) rtemp[i] += z_abs1(&B_col[i]); - - s = 0.0; - for (i = 0; i < m_loc; ++i) { - if ( rtemp[i] > safe2 ) { - s = SUPERLU_MAX(s, z_abs1(&R[i]) / rtemp[i]); - } else if ( rtemp[i] != 0.0 ) { - s = SUPERLU_MAX(s, (safe1 + z_abs1(&R[i])) / rtemp[i]); - } - /* If temp[i] is exactly 0.0 (computed by PxGSMV), then - we know the true residual also must be exactly 0.0. */ - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - pzgstrs(n, LUstruct, ScalePermstruct, grid, - dx, m_loc, fst_row, m_loc, 1, - SOLVEstruct, stat, info); - - /* Update solution. */ - for (i = 0; i < m_loc; ++i) - z_add(&X_col[i], &X_col[i], &dx[i]); - - lstres = berr[j]; - ++count; - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - /* Deallocate storage. */ - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsrfs()"); -#endif - -} /* PZGSRFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,240 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_zdefs.h" - -void -pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, - doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, - SOLVEstruct_t *SOLVEstruct, - double *berr, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PZGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates - * for the solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * A (input) SuperMatrix* - * The original matrix A, or the scaled A if equilibration was done. - * A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pdgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_zdefs.h for the definition of 'LUstruct_t'. - * - * ScalePermstruct (input) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input) doublecomplex* (local) - * The m_loc-by-NRHS right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * X (input/output) doublecomplex* (local) - * On entry, the solution matrix Y, as computed by PDGSTRS, of the - * transformed system A1*Y = Pc*Pr*B. where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X. - * On exit, the improved solution matrix Y. - * - * In order to obtain the solution X to the original system, - * Y should be permutated by Pc^T, and premultiplied by diag(C) - * if DiagScale = COL or BOTH. - * This must be done after this routine is called. - * - * ldx (input) int (local) - * Leading dimension of matrix X. - * - * nrhs (input) int - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics about the refinement steps. - * See util.h for the definition of SuperLUStat_t. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 20 - - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col; - double *rtemp; - int_t count, i, j, lwork, nz; - int iam; - double eps, lstres; - double s, safmin, safe1, safe2; - - /* Data structures used by matrix-vector multiply routine. */ - pzgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; - NRformat_loc *Astore; - int_t m_loc, fst_row; - - - /* Initialization. */ - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - iam = grid->iam; - - /* Test the input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; - else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; - else if ( nrhs < 0 ) *info = -13; - if (*info != 0) { - i = -(*info); - pxerbla("PZGSRFS", grid, i); - return; - } - - /* Quick return if possible. */ - if ( n == 0 || nrhs == 0 ) { - return; - } - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgsrfs()"); -#endif - - lwork = 2 * m_loc; /* For ax/R/dx and temp */ - if ( !(work = doublecomplexMalloc_dist(lwork)) ) - ABORT("Malloc fails for work[]"); - ax = R = dx = work; - temp = ax + m_loc; - rtemp = (double *) temp; - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", - eps, anorm, safe1, safe2); -#endif - - /* Do for each right-hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - B_col = &B[j*ldb]; - X_col = &X[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - - /* Matrix-vector multiply. */ - pzgsmv(0, A, grid, gsmv_comm, X_col, ax); - - /* Compute residual, stored in R[]. */ - for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]); - - /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ - pzgsmv(1, A, grid, gsmv_comm, X_col, temp); - /* NOTE: rtemp is aliased to temp */ - for (i = 0; i < m_loc; ++i) rtemp[i] += z_abs1(&B_col[i]); - - s = 0.0; - for (i = 0; i < m_loc; ++i) { - if ( rtemp[i] > safe2 ) - s = SUPERLU_MAX(s, z_abs1(&R[i]) / rtemp[i]); - else - s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rtemp[i]+safe1)); - } - MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); - -#if ( PRNTlevel>= 1 ) - if ( !iam ) - printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); -#endif - if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { - /* Compute new dx. */ - pzgstrs(n, LUstruct, ScalePermstruct, grid, - dx, m_loc, fst_row, m_loc, 1, - SOLVEstruct, stat, info); - - /* Update solution. */ - for (i = 0; i < m_loc; ++i) - z_add(&X_col[i], &X_col[i], &dx[i]); - - lstres = berr[j]; - ++count; - } else { - break; - } - } /* end while */ - - stat->RefineSteps = count; - - } /* for j ... */ - - /* Deallocate storage. */ - SUPERLU_FREE(work); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgsrfs()"); -#endif - -} /* PZGSRFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1064 +0,0 @@ - -#include -#include "superlu_zdefs.h" - - -void -pzgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * - * pzgssvx_ABglobal solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right hand sides, and its dimensions ldb and nrhs - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has several options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * - A, the input matrix - * - * as well as the following options, which are described in more - * detail below: - * - * - options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * - options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * - options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * - options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * - ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * - ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * - ScalePermstruct->R, array of row scale factors - * - ScalePermstruct->C, array of column scale factors - * - ScalePermstruct->perm_r, row permutation vector - * - ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * Pc*Pr*diag(R)*A*diag(C) - * where - * Pr and Pc are row and columns permutation matrices determined - * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, - * respectively, and - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * - LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In this - * case the algorithm saves time by reusing the previously computed - * column permutation vector stored in ScalePermstruct->perm_c - * and the "elimination tree" of A stored in LUstruct->etree. - * - * In this case the user must still specify the following options - * as before: - * - * - options->Equil - * - options->RowPerm - * - options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * - A, the input matrix - * - ScalePermstruct->perm_c, the column permutation - * - LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * as described above - * - ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * - LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * - options->Equil - * - options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are ignored. - * This is because the permutations from ScalePermstruct->perm_r and - * ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * - A, the input matrix - * - ScalePermstruct->DiagScale, how the previous matrix was row and/or - * column scaled - * - ScalePermstruct->R, the row scalings of the previous matrix, if any - * - ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * - ScalePermstruct->perm_r, the row permutation of the previous matrix - * - ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * - all of LUstruct, the previously computed information about L and U - * (the actual numerical values of L and U stored in - * LUstruct->Llu are ignored) - * - * The outputs returned include - * - * - A, the input matrix A overwritten by the scaled and permuted matrix - * as described above - * - ScalePermstruct, modified to describe how the input matrix A was - * equilibrated - * (thus ScalePermstruct->DiagScale, R and C may be modified) - * - LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * - A, the unfactored matrix, only in the case that iterative refinment - * is to be done (specifically A must be the output A from - * the previous call, so that it has been scaled and permuted) - * - all of ScalePermstruct - * - all of LUstruct, including the actual numerical values of L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. That is, A is stored in - * compressed column format (also known as Harwell-Boeing format). - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine pzgstrf can factorize rectangular matrices. - * On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C), - * depending on ScalePermstruct->DiagScale, options->RowPerm and - * options->colpem: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->RowPerm != NATURAL, A is further overwritten by - * Pr*diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * Pc*Pr*diag(R)*A*diag(C). - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * NOTE: Currently, A must reside in all processes when calling - * this routine. - * - * ScalePermstruct (input/output) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) doublecomplex* - * On entry, the right-hand side matrix of dimension (A->nrow, nrhs). - * On exit, the solution matrix if info = 0; - * - * NOTE: Currently, B must reside in all processes when calling - * this routine. - * - * ldb (input) int (global) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_zdefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) - * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. - * - * berr (output) double*, dimension (nrhs) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * - * See superlu_zdefs.h for the definitions of various data types. - * - */ - SuperMatrix AC; - NCformat *Astore; - NCPformat *ACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by DDISTRIBUTE - routine. They will be freed after DDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - doublecomplex *a; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *colptr, *rowind; - int_t colequ, Equil, factored, job, notran, rowequ; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int iam; - int ldx; /* LDA for matrix X (global). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - doublecomplex *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - - /* Test input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < A->nrow ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pzgssvx_ABglobal", grid, -*info); - return; - } - - /* Initialization */ - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - m = A->nrow; - n = A->ncol; - Astore = A->Store; - nnz = Astore->nnz; - a = Astore->nzval; - colptr = Astore->colptr; - rowind = Astore->rowind; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgssvx_ABglobal()"); -#endif - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - zd_mult(&a[i], &a[i], R[i]); /* Scale rows. */ - } - } - break; - case COL: - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) - zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */ - break; - case BOTH: - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */ - zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */ - } - } - break; - } - } else { - if ( !iam ) { - /* Compute row and column scalings to equilibrate matrix A. */ - zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); - - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); - } else { - if ( iinfo > 0 ) { - if ( iinfo <= m ) - fprintf(stderr, "The %d-th row of A is exactly zero\n", - iinfo); - else fprintf(stderr, "The %d-th column of A is exactly zero\n", - iinfo-n); - exit(-1); - } - } - } else { - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); - } else { - ABORT("ZGSEQU failed\n"); - } - } - - /* Equilibrate matrix A. */ - zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* end if Equil ... */ - - /* ------------------------------------------------------------ - Permute rows of A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ - || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - } else if ( !factored ) { - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation for large diagonal. */ - zldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - zd_mult(&a[i], &a[i], R1[irow]); /* Scale rows. */ - zd_mult(&a[i], &a[i], C1[j]); /* Scale columns. */ - rowind[i] = perm_r[irow]; -#if ( PRNTlevel>=2 ) - if ( rowind[i] == j ) /* New diagonal */ - dprod *= z_abs1(&a[i]); -#endif - } - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - } else { /* No equilibration. */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; -#if ( PRNTlevel>=2 ) - if ( rowind[i] == j ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, z_abs1(&a[i])); - else if ( job == 4 ) - dsum += z_abs1(&a[i]); - else if ( job == 5 ) - dprod *= z_abs1(&a[i]); - } -#endif - } - } - } - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* else !factored */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; - - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = zlangs_dist(norm, A); - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && Fact == DOFACT ) - /* Use an ordering provided by SuperLU */ - get_perm_c_dist(iam, permc_spec, A, perm_c); - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - sp_colorder(options, A, perm_c, etree, &AC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ - ACstore = AC.Store; - for (j = 0; j < n; ++j) - for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { - irow = ACstore->rowind[i]; - ACstore->rowind[i] = perm_c[irow]; - } - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Perform a symbolic factorization on matrix A and set up the - nonzero data structures which are suitable for supernodal GENP. */ - if ( Fact != SamePattern_SameRowPerm ) { -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - iinfo = symbfact(options, iam, &AC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - - if ( iinfo < 0 ) { - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr, "symbfact() error returns %d\n", iinfo); - exit(-1); - } - } - } - - /* Distribute the L and U factors onto the process grid. */ - t = SuperLU_timer_(); - dist_mem_use = zdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factor. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - -#if ( PRNTlevel>=2 ) - if ( !iam ) printf(".. pzgstrf INFO = %d\n", *info); -#endif - - } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ - /* Permute columns of A to form A*Pc' using the existing perm_c. - * NOTE: rows of A were previously permuted to Pc*A. - */ - sp_colorder(options, A, perm_c, NULL, &AC); - } /* if !factored ... */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doublecomplexMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], R[i]); - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], C[i]); - b_col += ldb; - } - } - - /* ------------------------------------------------------------ - Permute the right-hand side to form Pr*B. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - if ( notran ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; - for (i = 0; i < m; ++i) b_col[i] = b_work[i]; - b_col += ldb; - } - } - } - - - /* ------------------------------------------------------------ - Permute the right-hand side to form Pc*B. - ------------------------------------------------------------*/ - if ( notran ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; - for (i = 0; i < m; ++i) b_col[i] = b_work[i]; - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - pzgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - t = SuperLU_timer_(); - pzgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, - X, ldx, nrhs, berr, stat, info); - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix X <= Pc'*X. */ - for (j = 0; j < nrhs; j++) { - b_col = &B[j*ldb]; - x_col = &X[j*ldx]; - for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; - } - - /* Transform the solution matrix X to a solution of the original system - before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], C[i]); - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], R[i]); - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it is not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored || (factored && options->IterRefine) ) - Destroy_CompCol_Permuted_dist(&AC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgssvx_ABglobal()"); -#endif -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1280 +0,0 @@ - -#include -#include "superlu_zdefs.h" - -void -pzgssvx(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 2.2) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * Feburary 20, 2008 - * - * - * Purpose - * ======= - * - * PZGSSVX solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * The input matrices A and B are distributed by block rows. - * Here is a graphical illustration (0-based indexing): - * - * A B - * 0 --------------- ------ - * | | | | - * | | P0 | | - * | | | | - * --------------- ------ - * - fst_row->| | | | - * | | | | | - * m_loc | | P1 | | - * | | | | | - * - | | | | - * --------------- ------ - * | . | |. | - * | . | |. | - * | . | |. | - * --------------- ------ - * - * where, fst_row is the row number of the first row, - * m_loc is the number of rows local to this processor - * These are defined in the 'SuperMatrix' structure, see supermatrix.h. - * - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right-hand sides, distributed by block rows, - * and its dimensions ldb (local) and nrhs (global) - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has four options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * o A, the input matrix - * - * as well as the following options to determine what matrix to - * factorize. - * - * o options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * o options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * o options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * o options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * o ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * . ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * . ScalePermstruct->R, array of row scale factors - * . ScalePermstruct->C, array of column scale factors - * . ScalePermstruct->perm_r, row permutation vector - * . ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix diag(R)*A*diag(C)*Pc^T, where - * Pc is the row permutation matrix determined by - * ScalePermstruct->perm_c - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * o LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In - * this case the algorithm saves time by reusing the previously - * computed column permutation vector stored in - * ScalePermstruct->perm_c and the "elimination tree" of A - * stored in LUstruct->etree - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->RowPerm - * o options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->perm_c, the column permutation - * o LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * o LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are - * ignored. This is because the permutations from ScalePermstruct->perm_r - * and ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->DiagScale, how the previous matrix was row - * and/or column scaled - * o ScalePermstruct->R, the row scalings of the previous matrix, - * if any - * o ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * o ScalePermstruct->perm_r, the row permutation of the previous - * matrix - * o ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * o all of LUstruct, the previously computed information about - * L and U (the actual numerical values of L and U - * stored in LUstruct->Llu are ignored) - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated (thus ScalePermstruct->DiagScale, - * R and C may be modified) - * o LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * o A, the unfactored matrix, only in the case that iterative - * refinment is to be done (specifically A must be the output - * A from the previous call, so that it has been scaled and permuted) - * o all of ScalePermstruct - * o all of LUstruct, including the actual numerical values of - * L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* (global) - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* (local) - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * That is, A is stored in distributed compressed row format. - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, - * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * diag(R)*A*diag(C)*Pc^T. - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * ScalePermstruct (input/output) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) doublecomplex* (local) - * On entry, the right-hand side matrix of dimension (m_loc, nrhs), - * where, m_loc is the number of rows stored locally on my - * process and is defined in the data structure of matrix A. - * On exit, the solution matrix if info = 0; - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* (global) - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_zdefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) (global) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) (global) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) (local) - * The distributed data structures to store L and U factors. - * See superlu_zdefs.h for the definition of 'LocalLU_t'. - * - * SOLVEstruct (input/output) SOLVEstruct_t* - * The data structure to hold the communication pattern used - * in the phases of triangular solution and iterative refinement. - * This pattern should be intialized only once for repeated solutions. - * If options->SolveInitialized = YES, it is an input argument. - * If options->SolveInitialized = NO and nrhs != 0, it is an output - * argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'. - * - * berr (output) double*, dimension (nrhs) (global) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * See superlu_zdefs.h for the definitions of varioous data types. - * - */ - NRformat_loc *Astore; - SuperMatrix GA; /* Global A in NC format */ - NCformat *GAstore; - doublecomplex *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by PDDISTRIBUTE - routine. They will be freed after PDDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - doublecomplex *a; - int_t *colptr, *rowind; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *rowptr, *colind; /* Local A in NR*/ - int_t *rowind_loc, *colptr_loc; - int_t colequ, Equil, factored, job, notran, rowequ, need_value; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int_t nnz_loc, m_loc, fst_row, icol; - int iam; - int ldx; /* LDA for matrix X (local). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - doublecomplex *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - int_t procs; - - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep, parSymbFact; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - /* Initialization. */ - m = A->nrow; - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (doublecomplex *) Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - /* Test the input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < m_loc ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pzgssvx", grid, -*info); - return; - } - - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. */ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgssvx()"); -#endif - - /* Not factored & ask for equilibration */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j+1]; ++i){ - icol = colind[i]; - zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ - zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ - } - ++irow; - } - break; - } - } else { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pzgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - /* Equilibrate matrix A if it is badly-scaled. */ - pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* if Equil ... */ - - if ( !factored ) { /* Skip this if already factored. */ - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if ( Fact != SamePattern_SameRowPerm ) { - need_value = (options->RowPerm == LargeDiag); - pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *) GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - if ( need_value ) a_GA = (doublecomplex *) GAstore->nzval; - else assert(GAstore->nzval == NULL); - } - - /* ------------------------------------------------------------ - Find the row permutation for A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - if ( Fact != SamePattern_SameRowPerm ) { - if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else { /* options->RowPerm == LargeDiag */ - /* Get a new perm_r[] */ - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = doubleMalloc_dist(m)) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = doubleMalloc_dist(n)) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation */ - zldperm(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - zd_mult(&a[i], &a[i], R1[irow]); - zd_mult(&a[i], &a[i], C1[icol]); -#if ( PRNTlevel>=2 ) - if ( perm_r[irow] == icol ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, z_abs1(&a[i])); - else if ( job == 4 ) - dsum += z_abs1(&a[i]); - else if ( job == 5 ) - dprod *= z_abs1(&a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* end if options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); -#endif - } /* end if Fact ... */ - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - -#if ( DEBUGlevel>=2 ) - if ( !iam ) PrintInt10("perm_r", m, perm_r); -#endif - } /* end if (!factored) */ - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = pzlangs(norm, A, grid); -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. anorm %e\n", anorm); -#endif - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - parSymbFact = options->ParSymbFact; - -#if ( PRNTlevel>=1 ) - if ( parSymbFact && permc_spec != PARMETIS ) - if ( !iam ) printf(".. Parallel symbolic factorization" - " only works wth ParMetis!\n"); -#endif - - if ( parSymbFact == YES || permc_spec == PARMETIS ) { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); - - /* create a new communicator for the first noDomains processors in - grid->comm */ - key = iam; - if (iam < noDomains) col = 0; - else col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm ); - - permc_spec = PARMETIS; /* only works with PARMETIS */ - } - - if ( permc_spec != MY_PERMC && Fact == DOFACT ) { - if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT("ERROR in get perm_c parmetis."); - } else { - get_perm_c_dist(iam, permc_spec, &GA, perm_c); - } - } - - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if ( Fact != SamePattern_SameRowPerm ) { - if ( parSymbFact == NO ) { - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if ( iinfo < 0 ) { /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr,"symbfact() error returns %d\n",iinfo); - exit(-1); - } - } - } /* end if serial symbolic factorization */ - else { /* parallel symbolic factorization */ - t = SuperLU_timer_(); - flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (flinfo > 0) - ABORT("Insufficient memory for parallel symbolic factorization."); - } - } /* end if Fact ... */ - - if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); - if (sizes) SUPERLU_FREE (sizes); - if (fstVtxSep) SUPERLU_FREE (fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free (&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_(); - dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factorization. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - } else { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_(); - dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_() - t; - } - - if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - if (parSymbFact == TRUE) - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical - factorization */ - temp = SUPERLU_MAX(symb_mem_usage.total, - (float)dist_mem_use); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - - /* Destroy GA */ - if ( Fact != SamePattern_SameRowPerm ) - Destroy_CompCol_Matrix_dist(&GA); - } /* end if (!factored) */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doublecomplexMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], R[irow]); - ++irow; - } - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], C[irow]); - ++irow; - } - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - if ( options->SolveInitialized == NO ) { - zSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, - SOLVEstruct); - } - - pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - - t = SuperLU_timer_(); - if ( options->RefineInitialized == NO || Fact == DOFACT ) { - /* All these cases need to re-initialize gsmv structure */ - if ( options->RefineInitialized ) - pzgsmv_finalize(SOLVEstruct->gsmv_comm); - pzgsmv_init(A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); - - /* Save a copy of the transformed local col indices - in colind_gsmv[]. */ - if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); - if ( !(it = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for colind_gsmv[]"); - colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; - options->RefineInitialized = YES; - } else if ( Fact == SamePattern || - Fact == SamePattern_SameRowPerm ) { - doublecomplex at; - int_t k, jcol, p; - /* Swap to beginning the part of A corresponding to the - local part of X, as was done in pzgsmv_init() */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = rowptr[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - p = SOLVEstruct->row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - at = a[k]; a[k] = a[j]; a[j] = at; - ++k; - } - } - } - - /* Re-use the local col indices of A obtained from the - previous call to pzgsmv_init() */ - for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; - } - - if ( nrhs == 1 ) { /* Use the existing solve structure */ - SOLVEstruct1 = SOLVEstruct; - } else { /* For nrhs > 1, since refinement is performed for RHS - one at a time, the communication structure for pdgstrs - is different than the solve with nrhs RHS. - So we use SOLVEstruct1 for the refinement step. - */ - if ( !(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) - ABORT("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. */ - if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pzgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if ( nrhs > 1 ) { - pxgstrs_finalize(SOLVEstruct1->gstrs_comm); - SUPERLU_FREE(SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix B <= Pc'*X. */ - pzPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if ( DEBUGlevel>=2 ) - printf("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); -#endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], C[irow]); - ++irow; - } - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], R[irow]); - ++irow; - } - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it was not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) - Destroy_CompCol_Permuted_dist(&GAC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgssvx()"); -#endif - -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,1281 +0,0 @@ - -#include -#include "superlu_zdefs.h" - -void -pzgssvx(superlu_options_t *options, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid, - LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * -- Distributed SuperLU routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * - * Last update: 2/18/2008 - * - * - * Purpose - * ======= - * - * PZGSSVX solves a system of linear equations A*X=B, - * by using Gaussian elimination with "static pivoting" to - * compute the LU factorization of A. - * - * Static pivoting is a technique that combines the numerical stability - * of partial pivoting with the scalability of Cholesky (no pivoting), - * to run accurately and efficiently on large numbers of processors. - * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed - * description of the parallel algorithms. - * - * The input matrices A and B are distributed by block rows. - * Here is a graphical illustration (0-based indexing): - * - * A B - * 0 --------------- ------ - * | | | | - * | | P0 | | - * | | | | - * --------------- ------ - * - fst_row->| | | | - * | | | | | - * m_loc | | P1 | | - * | | | | | - * - | | | | - * --------------- ------ - * | . | |. | - * | . | |. | - * | . | |. | - * --------------- ------ - * - * where, fst_row is the row number of the first row, - * m_loc is the number of rows local to this processor - * These are defined in the 'SuperMatrix' structure, see supermatrix.h. - * - * - * Here are the options for using this code: - * - * 1. Independent of all the other options specified below, the - * user must supply - * - * - B, the matrix of right-hand sides, distributed by block rows, - * and its dimensions ldb (local) and nrhs (global) - * - grid, a structure describing the 2D processor mesh - * - options->IterRefine, which determines whether or not to - * improve the accuracy of the computed solution using - * iterative refinement - * - * On output, B is overwritten with the solution X. - * - * 2. Depending on options->Fact, the user has four options - * for solving A*X=B. The standard option is for factoring - * A "from scratch". (The other options, described below, - * are used when A is sufficiently similar to a previously - * solved problem to save time by reusing part or all of - * the previous factorization.) - * - * - options->Fact = DOFACT: A is factored "from scratch" - * - * In this case the user must also supply - * - * o A, the input matrix - * - * as well as the following options to determine what matrix to - * factorize. - * - * o options->Equil, to specify how to scale the rows and columns - * of A to "equilibrate" it (to try to reduce its - * condition number and so improve the - * accuracy of the computed solution) - * - * o options->RowPerm, to specify how to permute the rows of A - * (typically to control numerical stability) - * - * o options->ColPerm, to specify how to permute the columns of A - * (typically to control fill-in and enhance - * parallelism during factorization) - * - * o options->ReplaceTinyPivot, to specify how to deal with tiny - * pivots encountered during factorization - * (to control numerical stability) - * - * The outputs returned include - * - * o ScalePermstruct, modified to describe how the input matrix A - * was equilibrated and permuted: - * . ScalePermstruct->DiagScale, indicates whether the rows and/or - * columns of A were scaled - * . ScalePermstruct->R, array of row scale factors - * . ScalePermstruct->C, array of column scale factors - * . ScalePermstruct->perm_r, row permutation vector - * . ScalePermstruct->perm_c, column permutation vector - * - * (part of ScalePermstruct may also need to be supplied on input, - * depending on options->RowPerm and options->ColPerm as described - * later). - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix diag(R)*A*diag(C)*Pc^T, where - * Pc is the row permutation matrix determined by - * ScalePermstruct->perm_c - * diag(R) and diag(C) are diagonal scaling matrices determined - * by ScalePermstruct->DiagScale, ScalePermstruct->R and - * ScalePermstruct->C - * - * o LUstruct, which contains the L and U factorization of A1 where - * - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored - * in A on output.) - * - * 3. The second value of options->Fact assumes that a matrix with the same - * sparsity pattern as A has already been factored: - * - * - options->Fact = SamePattern: A is factored, assuming that it has - * the same nonzero pattern as a previously factored matrix. In - * this case the algorithm saves time by reusing the previously - * computed column permutation vector stored in - * ScalePermstruct->perm_c and the "elimination tree" of A - * stored in LUstruct->etree - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->RowPerm - * o options->ReplaceTinyPivot - * - * but not options->ColPerm, whose value is ignored. This is because the - * previous column permutation from ScalePermstruct->perm_c is used as - * input. The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->perm_c, the column permutation - * o LUstruct->etree, the elimination tree - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated and row permuted - * o LUstruct, modified to contain the new L and U factors - * - * 4. The third value of options->Fact assumes that a matrix B with the same - * sparsity pattern as A has already been factored, and where the - * row permutation of B can be reused for A. This is useful when A and B - * have similar numerical values, so that the same row permutation - * will make both factorizations numerically stable. This lets us reuse - * all of the previously computed structure of L and U. - * - * - options->Fact = SamePattern_SameRowPerm: A is factored, - * assuming not only the same nonzero pattern as the previously - * factored matrix B, but reusing B's row permutation. - * - * In this case the user must still specify the following options - * as before: - * - * o options->Equil - * o options->ReplaceTinyPivot - * - * but not options->RowPerm or options->ColPerm, whose values are - * ignored. This is because the permutations from ScalePermstruct->perm_r - * and ScalePermstruct->perm_c are used as input. - * - * The user must also supply - * - * o A, the input matrix - * o ScalePermstruct->DiagScale, how the previous matrix was row - * and/or column scaled - * o ScalePermstruct->R, the row scalings of the previous matrix, - * if any - * o ScalePermstruct->C, the columns scalings of the previous matrix, - * if any - * o ScalePermstruct->perm_r, the row permutation of the previous - * matrix - * o ScalePermstruct->perm_c, the column permutation of the previous - * matrix - * o all of LUstruct, the previously computed information about - * L and U (the actual numerical values of L and U - * stored in LUstruct->Llu are ignored) - * - * The outputs returned include - * - * o A, the input matrix A overwritten by the scaled and permuted - * matrix as described above - * o ScalePermstruct, modified to describe how the input matrix A was - * equilibrated (thus ScalePermstruct->DiagScale, - * R and C may be modified) - * o LUstruct, modified to contain the new L and U factors - * - * 5. The fourth and last value of options->Fact assumes that A is - * identical to a matrix that has already been factored on a previous - * call, and reuses its entire LU factorization - * - * - options->Fact = Factored: A is identical to a previously - * factorized matrix, so the entire previous factorization - * can be reused. - * - * In this case all the other options mentioned above are ignored - * (options->Equil, options->RowPerm, options->ColPerm, - * options->ReplaceTinyPivot) - * - * The user must also supply - * - * o A, the unfactored matrix, only in the case that iterative - * refinment is to be done (specifically A must be the output - * A from the previous call, so that it has been scaled and permuted) - * o all of ScalePermstruct - * o all of LUstruct, including the actual numerical values of - * L and U - * - * all of which are unmodified on output. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* (global) - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following fields should be defined for this structure: - * - * o Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorized based on the previous history. - * - * = DOFACT: The matrix A will be factorized from scratch. - * Inputs: A - * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * all of ScalePermstruct - * all of LUstruct - * - * = SamePattern: the matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the elimination tree - * LUstruct->etree - * Inputs: A - * options->Equil, RowPerm, ReplaceTinyPivot - * ScalePermstruct->perm_c - * LUstruct->etree - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * rest of ScalePermstruct (DiagScale, R, C, perm_r) - * rest of LUstruct (GLU_persist, Llu) - * - * = SamePattern_SameRowPerm: the matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * Inputs: A - * options->Equil, ReplaceTinyPivot - * all of ScalePermstruct - * all of LUstruct - * Outputs: modified A - * (possibly row and/or column scaled and/or - * permuted) - * modified LUstruct->Llu - * = FACTORED: the matrix A is already factored. - * Inputs: all of ScalePermstruct - * all of LUstruct - * - * o Equil (yes_no_t) - * Specifies whether to equilibrate the system. - * = NO: no equilibration. - * = YES: scaling factors are computed to equilibrate the system: - * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. - * Whether or not the system will be equilibrated depends - * on the scaling of the matrix A, but if equilibration is - * used, A is overwritten by diag(R)*A*diag(C) and B by - * diag(R)*B. - * - * o RowPerm (rowperm_t) - * Specifies how to permute rows of the matrix A. - * = NATURAL: use the natural ordering. - * = LargeDiag: use the Duff/Koster algorithm to permute rows of - * the original matrix to make the diagonal large - * relative to the off-diagonal. - * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r - * input by the user. - * - * o ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: natural ordering. - * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. - * = MMD_ATA: minimum degree ordering on structure of A'*A. - * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. - * - * o ReplaceTinyPivot (yes_no_t) - * = NO: do not modify pivots - * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during - * LU factorization. - * - * o IterRefine (IterRefine_t) - * Specifies how to perform iterative refinement. - * = NO: no iterative refinement. - * = DOUBLE: accumulate residual in double precision. - * = EXTRA: accumulate residual in extra precision. - * - * NOTE: all options must be indentical on all processes when - * calling this routine. - * - * A (input/output) SuperMatrix* (local) - * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A must be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * That is, A is stored in distributed compressed row format. - * See supermatrix.h for the definition of 'SuperMatrix'. - * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, - * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by - * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by - * diag(R)*A*diag(C)*Pc^T. - * If all the above condition are true, the LU decomposition is - * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. - * - * ScalePermstruct (input/output) ScalePermstruct_t* (global) - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the matrix A. - * It contains the following fields: - * - * o DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: no equilibration. - * = ROW: row equilibration, i.e., A was premultiplied by - * diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied - * by diag(C). - * = BOTH: both row and column equilibration, i.e., A was - * replaced by diag(R)*A*diag(C). - * If options->Fact = FACTORED or SamePattern_SameRowPerm, - * DiagScale is an input argument; otherwise it is an output - * argument. - * - * o perm_r (int*) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->RowPerm = MY_PERMR, or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument; otherwise it is an output argument. - * - * o perm_c (int*) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern - * or options->Fact = SamePattern_SameRowPerm, perm_c is an - * input argument; otherwise, it is an output argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree - * is already in postorder. - * - * o R (double*) dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by - * diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is - * an input argument; otherwise, R is an output argument. - * - * o C (double*) dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by - * diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is - * an input argument; otherwise, C is an output argument. - * - * B (input/output) doublecomplex* (local) - * On entry, the right-hand side matrix of dimension (m_loc, nrhs), - * where, m_loc is the number of rows stored locally on my - * process and is defined in the data structure of matrix A. - * On exit, the solution matrix if info = 0; - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * The number of right-hand sides. - * If nrhs = 0, only LU decomposition is performed, the forward - * and back substitutions are skipped. - * - * grid (input) gridinfo_t* (global) - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_zdefs.h for the definition of 'gridinfo_t'. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * It contains the following fields: - * - * o etree (int*) dimension (A->ncol) (global) - * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. - * It is computed in sp_colorder() during the first factorization, - * and is reused in the subsequent factorizations of the matrices - * with the same nonzero pattern. - * On exit of sp_colorder(), the columns of A are permuted so that - * the etree is in a certain postorder. This postorder is reflected - * in ScalePermstruct->perm_c. - * NOTE: - * Etree is a vector of parent pointers for a forest whose vertices - * are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * o Glu_persist (Glu_persist_t*) (global) - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (LocalLU_t*) (local) - * The distributed data structures to store L and U factors. - * See superlu_zdefs.h for the definition of 'LocalLU_t'. - * - * SOLVEstruct (input/output) SOLVEstruct_t* - * The data structure to hold the communication pattern used - * in the phases of triangular solution and iterative refinement. - * This pattern should be intialized only once for repeated solutions. - * If options->SolveInitialized = YES, it is an input argument. - * If options->SolveInitialized = NO and nrhs != 0, it is an output - * argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'. - * - * berr (output) double*, dimension (nrhs) (global) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - * See superlu_zdefs.h for the definitions of varioous data types. - * - */ - NRformat_loc *Astore; - SuperMatrix GA; /* Global A in NC format */ - NCformat *GAstore; - doublecomplex *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; - /* The nonzero structures of L and U factors, which are - replicated on all processrs. - (lsub, xlsub) contains the compressed subscript of - supernodes in L. - (usub, xusub) contains the compressed subscript of - nonzero segments in U. - If options->Fact != SamePattern_SameRowPerm, they are - computed by SYMBFACT routine, and then used by PDDISTRIBUTE - routine. They will be freed after PDDISTRIBUTE routine. - If options->Fact == SamePattern_SameRowPerm, these - structures are not used. */ - fact_t Fact; - doublecomplex *a; - int_t *colptr, *rowind; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *rowptr, *colind; /* Local A in NR*/ - int_t *rowind_loc, *colptr_loc; - int_t colequ, Equil, factored, job, notran, rowequ, need_value; - int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; - int_t nnz_loc, m_loc, fst_row, icol; - int iam; - int ldx; /* LDA for matrix X (local). */ - char equed[1], norm[1]; - double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; - doublecomplex *X, *b_col, *b_work, *x_col; - double t; - static mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) - double dmin, dsum, dprod; -#endif - int_t procs; - - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep, parSymbFact; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - /* Initialization. */ - m = A->nrow; - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (doublecomplex *) Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - /* Test the input parameters. */ - *info = 0; - Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) - *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) - *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) - *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) - *info = -1; - else if ( options->IterRefine == EXTRA ) { - *info = -1; - fprintf(stderr, "Extra precise iterative refinement yet to support."); - } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( ldb < m_loc ) - *info = -5; - else if ( nrhs < 0 ) - *info = -6; - if ( *info ) { - i = -(*info); - pxerbla("pzgssvx", grid, -*info); - return; - } - - factored = (Fact == FACTORED); - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - iam = grid->iam; - job = 5; - if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } else rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. */ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgssvx()"); -#endif - - /* Not factored & ask for equilibration */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - /* Allocate storage if not done so before. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if ( !(C = (double *) doubleMalloc_dist(n)) ) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if ( !(R = (double *) doubleMalloc_dist(m)) ) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------*/ - if ( Equil ) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if ( Fact == SamePattern_SameRowPerm ) { - /* Reuse R and C. */ - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j+1]; ++i){ - icol = colind[i]; - zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ - zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ - } - ++irow; - } - break; - } - } else { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pzgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - /* Equilibrate matrix A if it is badly-scaled. */ - pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if ( lsame_(equed, "R") ) { - ScalePermstruct->DiagScale = rowequ = ROW; - } else if ( lsame_(equed, "C") ) { - ScalePermstruct->DiagScale = colequ = COL; - } else if ( lsame_(equed, "B") ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. equilibrated? *equed = %c\n", *equed); - /*fflush(stdout);*/ - } -#endif - } /* if Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit equil"); -#endif - } /* if Equil ... */ - - if ( !factored ) { /* Skip this if already factored. */ - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if ( Fact != SamePattern_SameRowPerm ) { - need_value = (options->RowPerm == LargeDiag); - pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *) GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - if ( need_value ) a_GA = (doublecomplex *) GAstore->nzval; - else assert(GAstore->nzval == NULL); - } - - /* ------------------------------------------------------------ - Find the row permutation for A. - ------------------------------------------------------------*/ - if ( options->RowPerm != NO ) { - t = SuperLU_timer_(); - if ( Fact != SamePattern_SameRowPerm ) { - if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else { /* options->RowPerm == LargeDiag */ - /* Get a new perm_r[] */ - if ( job == 5 ) { - /* Allocate storage for scaling factors. */ - if ( !(R1 = doubleMalloc_dist(m)) ) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if ( !(C1 = doubleMalloc_dist(n)) ) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if ( !iam ) { - /* Process 0 finds a row permutation */ - zldperm(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } else { - MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); - if ( job == 5 && Equil ) { - MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); - MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); - } - } - -#if ( PRNTlevel>=2 ) - dmin = dlamch_("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if ( job == 5 ) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j+1]; ++i) { - icol = colind[i]; - zd_mult(&a[i], &a[i], R1[irow]); - zd_mult(&a[i], &a[i], C1[icol]); -#if ( PRNTlevel>=2 ) - if ( perm_r[irow] == icol ) { /* New diagonal */ - if ( job == 2 || job == 3 ) - dmin = SUPERLU_MIN(dmin, z_abs1(&a[i])); - else if ( job == 4 ) - dsum += z_abs1(&a[i]); - else if ( job == 5 ) - dprod *= z_abs1(&a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors. */ - if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; - else for (i = 0; i < m; ++i) R[i] = R1[i]; - if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; - else for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - -#if ( PRNTlevel>=2 ) - if ( job == 2 || job == 3 ) { - if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); - } else if ( job == 4 ) { - if ( !iam ) printf("\tsum of diagonal %e\n", dsum); - } else if ( job == 5 ) { - if ( !iam ) printf("\t product of diagonal %e\n", dprod); - } -#endif - - } /* end if options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); -#endif - } /* end if Fact ... */ - } else { /* options->RowPerm == NOROWPERM */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - -#if ( DEBUGlevel>=2 ) - if ( !iam ) PrintInt10("perm_r", m, perm_r); -#endif - } /* end if (!factored) */ - - if ( !factored || options->IterRefine ) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if ( notran ) *(unsigned char *)norm = '1'; - else *(unsigned char *)norm = 'I'; - anorm = pzlangs(norm, A, grid); -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. anorm %e\n", anorm); -#endif - } - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------*/ - if ( !factored ) { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - parSymbFact = options->ParSymbFact; - -#if ( PRNTlevel>=1 ) - if ( parSymbFact && permc_spec != PARMETIS ) - if ( !iam ) printf(".. Parallel symbolic factorization" - " only works wth ParMetis!\n"); -#endif - - if ( parSymbFact == YES || permc_spec == PARMETIS ) { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) ( pow(2, ((int) log2( (double)nprocs_num )))); - - /* create a new communicator for the first noDomains processors in - grid->comm */ - key = iam; - if (iam < noDomains) col = 0; - else col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm ); - - permc_spec = PARMETIS; /* only works with PARMETIS */ - } - - if ( permc_spec != MY_PERMC && Fact == DOFACT ) { - if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT("ERROR in get perm_c parmetis."); - } else { - get_perm_c_dist(iam, permc_spec, &GA, perm_c); - } - } - - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if ( Fact != SamePattern_SameRowPerm ) { - if ( parSymbFact == NO ) { - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if ( PRNTlevel>=1 ) - if ( !iam ) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); -#endif - t = SuperLU_timer_(); - if ( !(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if ( iinfo < 0 ) { /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); - printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), sizeof(float), - sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu*1e-6, - symb_mem_usage.total*1e-6, - symb_mem_usage.expansions); - } -#endif - } else { - if ( !iam ) { - fprintf(stderr,"symbfact() error returns %d\n",iinfo); - exit(-1); - } - } - } /* end if serial symbolic factorization */ - else { /* parallel symbolic factorization */ - t = SuperLU_timer_(); - flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (flinfo > 0) - ABORT("Insufficient memory for parallel symbolic factorization."); - } - } /* end if Fact ... */ - - if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); - if (sizes) SUPERLU_FREE (sizes); - if (fstVtxSep) SUPERLU_FREE (fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free (&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_(); - dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factorization. */ - if ( Fact != SamePattern_SameRowPerm ) { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } - } else { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_(); - dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_() - t; - } - - if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); - - /* Perform numerical factorization in parallel. */ - t = SuperLU_timer_(); - pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; - -#if ( PRNTlevel>=1 ) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); - MPI_Reduce( &num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - temp = SUPERLU_MAX(symb_mem_usage.total, - symb_mem_usage.for_lu + - (float)dist_mem_use + num_mem_usage.for_lu); - if (parSymbFact == TRUE) - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical - factorization */ - temp = SUPERLU_MAX(symb_mem_usage.total, - (float)dist_mem_use); - temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce( &temp, &max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &temp, &avg, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm ); - stat->TinyPivots = TinyPivots; - if ( !iam ) { - printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", - for_lu*1e-6, total*1e-6); - printf("\tAll space (MB):" - "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); - printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); - } - } -#endif - - /* Destroy GA */ - if ( Fact != SamePattern_SameRowPerm ) - Destroy_CompCol_Matrix_dist(&GA); - } /* end if (!factored) */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. - ------------------------------------------------------------*/ - if ( nrhs ) { - - if ( !(b_work = doublecomplexMalloc_dist(n)) ) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------------ - Scale the right-hand side if equilibration was performed. - ------------------------------------------------------------*/ - if ( notran ) { - if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], R[irow]); - ++irow; - } - b_col += ldb; - } - } - } else if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], C[irow]); - ++irow; - } - b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) ) - ABORT("Malloc fails for X[]"); - x_col = X; b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; - x_col += ldx; b_col += ldb; - } - - /* ------------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------------*/ - if ( options->SolveInitialized == NO ) { - zSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, - SOLVEstruct); - } - - pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); - - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------*/ - if ( options->IterRefine ) { - /* Improve the solution by iterative refinement. */ - int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - - t = SuperLU_timer_(); - if ( options->RefineInitialized == NO || Fact == DOFACT ) { - /* All these cases need to re-initialize gsmv structure */ - if ( options->RefineInitialized ) - pzgsmv_finalize(SOLVEstruct->gsmv_comm); - pzgsmv_init(A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); - - /* Save a copy of the transformed local col indices - in colind_gsmv[]. */ - if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); - if ( !(it = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for colind_gsmv[]"); - colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; - options->RefineInitialized = YES; - } else if ( Fact == SamePattern || - Fact == SamePattern_SameRowPerm ) { - doublecomplex at; - int_t k, jcol, p; - /* Swap to beginning the part of A corresponding to the - local part of X, as was done in pzgsmv_init() */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = rowptr[i]; - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - jcol = colind[j]; - p = SOLVEstruct->row_to_proc[jcol]; - if ( p == iam ) { /* Local */ - at = a[k]; a[k] = a[j]; a[j] = at; - ++k; - } - } - } - - /* Re-use the local col indices of A obtained from the - previous call to pzgsmv_init() */ - for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; - } - - if ( nrhs == 1 ) { /* Use the existing solve structure */ - SOLVEstruct1 = SOLVEstruct; - } else { /* For nrhs > 1, since refinement is performed for RHS - one at a time, the communication structure for pdgstrs - is different than the solve with nrhs RHS. - So we use SOLVEstruct1 for the refinement step. - */ - if ( !(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) - ABORT("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. */ - if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pzgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if ( nrhs > 1 ) { - pxgstrs_finalize(SOLVEstruct1->gstrs_comm); - SUPERLU_FREE(SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_() - t; - } - - /* Permute the solution matrix B <= Pc'*X. */ - pzPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if ( DEBUGlevel>=2 ) - printf("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); -#endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. */ - if ( notran ) { - if ( colequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], C[irow]); - ++irow; - } - b_col += ldb; - } - } - } else if ( rowequ ) { - b_col = B; - for (j = 0; j < nrhs; ++j) { - irow = fst_row; - for (i = 0; i < m_loc; ++i) { - zd_mult(&b_col[i], &b_col[i], R[irow]); - ++irow; - } - b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); -#endif - - /* Deallocate R and/or C if it was not used. */ - if ( Equil && Fact != SamePattern_SameRowPerm ) { - switch ( ScalePermstruct->DiagScale ) { - case NOEQUIL: - SUPERLU_FREE(R); - SUPERLU_FREE(C); - break; - case ROW: - SUPERLU_FREE(C); - break; - case COL: - SUPERLU_FREE(R); - break; - } - } - if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) - Destroy_CompCol_Permuted_dist(&GAC); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgssvx()"); -#endif - -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1278 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - */ - -#include -#include "superlu_zdefs.h" - -/* - * Internal prototypes - */ -static void pzgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *); -#ifdef _CRAY -static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif - -/* - * Sketch of the algorithm - * ======================= - * - * The following relations hold: - * * A_kk = L_kk * U_kk - * * L_ik = Aik * U_kk^(-1) - * * U_kj = L_kk^(-1) * A_kj - * - * ---------------------------------- - * | | | - * ----|----------------------------- - * | | \ U_kk| | - * | | \ | U_kj | - * | |L_kk \ | || | - * ----|-------|---------||---------- - * | | | \/ | - * | | | | - * | | | | - * | | | | - * | | L_ik ==> A_ij | - * | | | | - * | | | | - * | | | | - * ---------------------------------- - * - * Handle the first block of columns separately. - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. ( pzgstrf2(0), one column at a time ) - * * Compute block row of U - * * Update trailing matrix - * - * Loop over the remaining blocks of columns. - * mycol = MYCOL( iam, grid ); - * myrow = MYROW( iam, grid ); - * N = nsupers; - * For (k = 1; k < N; ++k) { - * krow = PROW( k, grid ); - * kcol = PCOL( k, grid ); - * Pkk = PNUM( krow, kcol, grid ); - * - * * Factor diagonal and subdiagonal blocks and test for exact - * singularity. - * if ( mycol == kcol ) { - * pzgstrf2(k), one column at a time - * } - * - * * Parallel triangular solve - * if ( iam == Pkk ) multicast L_k,k to this process row; - * if ( myrow == krow && mycol != kcol ) { - * Recv L_k,k from process Pkk; - * for (j = k+1; j < N; ++j) - * if ( PCOL( j, grid ) == mycol && A_k,j != 0 ) - * U_k,j = L_k,k \ A_k,j; - * } - * - * * Parallel rank-k update - * if ( myrow == krow ) multicast U_k,k+1:N to this process column; - * if ( mycol == kcol ) multicast L_k+1:N,k to this process row; - * if ( myrow != krow ) { - * Pkj = PNUM( krow, mycol, grid ); - * Recv U_k,k+1:N from process Pkj; - * } - * if ( mycol != kcol ) { - * Pik = PNUM( myrow, kcol, grid ); - * Recv L_k+1:N,k from process Pik; - * } - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L_i,k != 0 && U_k,j != 0 ) - * A_i,j = A_i,j - L_i,k * U_k,j; - * } - * } - * - * - * Remaining issues - * (1) Use local indices for L subscripts and SPA. [DONE] - * - */ -/************************************************************************/ -int_t pzgstrf -/************************************************************************/ -( - superlu_options_t *options, int m, int n, double anorm, - LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info - ) -/* - * Purpose - * ======= - * - * PZGSTRF performs the LU factorization in parallel. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * The following field should be defined: - * o ReplaceTinyPivot (yes_no_t) - * Specifies whether to replace the tiny diagonals by - * sqrt(epsilon)*norm(A) during LU factorization. - * - * m (input) int - * Number of rows in the matrix. - * - * n (input) int - * Number of columns in the matrix. - * - * anorm (input) double - * The norm of the original matrix A, or the scaled A if - * equilibration was done. - * - * LUstruct (input/output) LUstruct_t* - * The data structures to store the distributed L and U factors. - * The following fields should be defined: - * - * o Glu_persist (input) Glu_persist_t* - * Global data structure (xsup, supno) replicated on all processes, - * describing the supernode partition in the factored matrices - * L and U: - * xsup[s] is the leading column of the s-th supernode, - * supno[i] is the supernode number to which column i belongs. - * - * o Llu (input/output) LocalLU_t* - * The distributed data structures to store L and U factors. - * See superlu_zdefs.h for the definition of 'LocalLU_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_zdefs.h for the definition of 'gridinfo_t'. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ -#ifdef _CRAY - _fcd ftcs = _cptofcd("N", strlen("N")); - _fcd ftcs1 = _cptofcd("L", strlen("L")); - _fcd ftcs2 = _cptofcd("N", strlen("N")); - _fcd ftcs3 = _cptofcd("U", strlen("U")); -#endif - doublecomplex zero = {0.0, 0.0}; - doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; - int_t *xsup; - int_t *lsub, *lsub1, *usub, *Usub_buf, - *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - doublecomplex *lusup, *lusup1, *uval, *Uval_buf, - *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, - lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, - nlb, nub, nsupc, rel, rukp; - int_t Pc, Pr; - int iam, kcol, krow, mycol, myrow, pi, pj; - int j, k, lk, nsupers; - int nsupr, nbrow, segsize; - int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: - * 0 : transferred in Lsub_buf[] - * 1 : transferred in Lval_buf[] - * 2 : transferred in Usub_buf[] - * 3 : transferred in Uval_buf[] - */ - int_t msg0, msg2; - int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; - doublecomplex **Unzval_br_ptr, **Lnzval_bc_ptr; - int_t *index; - doublecomplex *nzval; - int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ - doublecomplex *ucol; - int_t *indirect; - doublecomplex *tempv, *tempv2d; - int_t iinfo; - int_t *ToRecv, *ToSendD, **ToSendR; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - superlu_scope_t *scp; - float s_eps; - double thresh; - doublecomplex *tempU2d, *tempu; - int full, ldt, ldu, lead_zero, ncols; - MPI_Request recv_req[4], *send_req; - MPI_Status status; -#if ( DEBUGlevel>=2 ) - int_t num_copy=0, num_update=0; -#endif -#if ( PRNTlevel==3 ) - int_t zero_msg = 0, total_msg = 0; -#endif -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; - int_t iword = sizeof(int_t), zword = sizeof(doublecomplex); -#endif - - /* Test the input parameters. */ - *info = 0; - if ( m < 0 ) *info = -2; - else if ( n < 0 ) *info = -3; - if ( *info ) { - pxerbla("pzgstrf", grid, -*info); - return (-1); - } - - /* Quick return if possible. */ - if ( m == 0 || n == 0 ) return 0; - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - s_eps = slamch_("Epsilon"); - thresh = s_eps * anorm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgstrf()"); -#endif - - stat->ops[FACT] = 0.0; - - if ( Pr*Pc > 1 ) { - i = Llu->bufmax[0]; - if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lsub_buf."); - Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; - i = Llu->bufmax[1]; - if ( !(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lval_buf[]."); - Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; - if ( Llu->bufmax[2] != 0 ) - if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) - ABORT("Malloc fails for Usub_buf[]."); - if ( Llu->bufmax[3] != 0 ) - if ( !(Llu->Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3])) ) - ABORT("Malloc fails for Uval_buf[]."); - if ( !(send_req = - (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) - ABORT("Malloc fails for send_req[]."); - } - if ( !(Llu->ujrow = doublecomplexMalloc_dist(sp_ienv_dist(3))) ) - ABORT("Malloc fails for ujrow[]."); - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); - printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", - Llu->bufmax[0], Llu->bufmax[1], - Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); - } -#endif - - Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; - Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; - Lval_buf_2[0] = Llu->Lval_buf_2[0]; - Lval_buf_2[1] = Llu->Lval_buf_2[1]; - Usub_buf = Llu->Usub_buf; - Uval_buf = Llu->Uval_buf; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; - ToRecv = Llu->ToRecv; - ToSendD = Llu->ToSendD; - ToSendR = Llu->ToSendR; - - ldt = sp_ienv_dist(3); /* Size of maximum supernode */ - if ( !(tempv2d = doublecomplexCalloc_dist(2*((size_t)ldt)*ldt)) ) - ABORT("Calloc fails for tempv2d[]."); - tempU2d = tempv2d + ldt*ldt; - if ( !(indirect = intMalloc_dist(ldt)) ) - ABORT("Malloc fails for indirect[]."); - k = CEILING( nsupers, Pr ); /* Number of local block rows */ - if ( !(iuip = intMalloc_dist(k)) ) - ABORT("Malloc fails for iuip[]."); - if ( !(ruip = intMalloc_dist(k)) ) - ABORT("Malloc fails for ruip[]."); - - - /* --------------------------------------------------------------- - Handle the first block column separately to start the pipeline. - --------------------------------------------------------------- */ - if ( mycol == 0 ) { - pzgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); - - scp = &grid->rscp; /* The scope of process row. */ - - /* Process column *kcol* multicasts numeric values of L(:,k) - to process rows. */ - lsub = Lrowind_bc_ptr[0]; - lusup = Lnzval_bc_ptr[0]; - if ( lsub ) { - msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub[1] * SuperSize( 0 ); - } else { - msgcnt[0] = msgcnt[1] = 0; - } - - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[0][pj] != EMPTY ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, - &send_req[pj] ); - MPI_Isend( lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, 1, scp->comm, - &send_req[pj+Pc] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, 0, msgcnt[0], msgcnt[1], pj); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*zword; -#endif - } - } /* for pj ... */ - } else { /* Post immediate receives. */ - if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, - 0, scp->comm, &recv_req[0] ); - MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, 0, - 1, scp->comm, &recv_req[1] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); -#endif - } - } /* if mycol == 0 */ - - /* ------------------------------------------ - MAIN LOOP: Loop through all block columns. - ------------------------------------------ */ - for (k = 0; k < nsupers; ++k) { - - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - - if ( mycol == kcol ) { - lk = LBj( k, grid ); /* Local block number. */ - - for (pj = 0; pj < Pc; ++pj) { - /* Wait for Isend to complete before using lsub/lusup. */ - if ( ToSendR[lk][pj] != EMPTY ) { - MPI_Wait( &send_req[pj], &status ); - MPI_Wait( &send_req[pj+Pc], &status ); - } - } - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - } else { - if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ - scp = &grid->rscp; /* The scope of process row. */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[0]);*/ - /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, - (4*k)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[0], &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); - /*probe_recv(iam, kcol, (4*k+1)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, - Llu->bufmax[1]);*/ - /*MPI_Recv( Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, - (4*k+1)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[1], &status ); - MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1] ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", - iam, k, msgcnt[0], msgcnt[1], kcol); - fflush(stdout); -#endif - lsub = Lsub_buf_2[k%2]; - lusup = Lval_buf_2[k%2]; -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[0] ) ++zero_msg; -#endif - } else msgcnt[0] = 0; - } /* if mycol = Pc(k) */ - - scp = &grid->cscp; /* The scope of process column. */ - - if ( myrow == krow ) { - /* Parallel triangular solve across process row *krow* -- - U(k,j) = L(k,k) \ A(k,j). */ -#ifdef _CRAY - pzgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); -#else - pzgstrs2(n, k, Glu_persist, grid, Llu, stat); -#endif - - /* Multicasts U(k,:) to process columns. */ - lk = LBi( k, grid ); - usub = Ufstnz_br_ptr[lk]; - uval = Unzval_br_ptr[lk]; - if ( usub ) { - msgcnt[2] = usub[2]; - msgcnt[3] = usub[1]; - } else { - msgcnt[2] = msgcnt[3] = 0; - } - - if ( ToSendD[lk] == YES ) { - for (pi = 0; pi < Pr; ++pi) { - if ( pi != myrow ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Send( usub, msgcnt[2], mpi_int_t, pi, - (4*k+2)%NTAGS, scp->comm); - MPI_Send( uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi, - (4*k+3)%NTAGS, scp->comm); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[2]*iword + msgcnt[3]*zword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); -#endif - } /* if pi ... */ - } /* for pi ... */ - } /* if ToSendD ... */ - } else { /* myrow != krow */ - if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[2]);*/ - MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, - (4*k+2)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); - /*probe_recv(iam, krow, (4*k+3)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, - Llu->bufmax[3]);*/ - MPI_Recv( Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow, - (4*k+3)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3] ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif - usub = Usub_buf; - uval = Uval_buf; -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); -#endif -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[2] ) ++zero_msg; -#endif - } else msgcnt[2] = 0; - } /* if myrow == Pr(k) */ - - /* - * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L(i,k) != 0 && U(k,j) != 0 ) - * A(i,j) = A(i,j) - L(i,k) * U(k,j); - */ - msg0 = msgcnt[0]; - msg2 = msgcnt[2]; - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - nsupr = lsub[1]; /* LDA of lusup. */ - if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ - lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; - luptr0 = knsupc; - nlb = lsub[0] - 1; - } else { - lptr0 = BC_HEADER; - luptr0 = 0; - nlb = lsub[0]; - } - lptr = lptr0; - for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ - ib = lsub[lptr]; - lib = LBi( ib, grid ); - iuip[lib] = BR_HEADER; - ruip[lib] = 0; - lptr += LB_DESCRIPTOR + lsub[lptr+1]; - } - nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - rukp = 0; /* Pointer to nzval[] of U(k,:) */ - klst = FstBlockC( k+1 ); - - /* --------------------------------------------------- - Update the first block column A(:,k+1). - --------------------------------------------------- */ - jb = usub[iukp]; /* Global block number of block U(k,j). */ - if ( jb == k+1 ) { /* First update (k+1)-th block. */ - --nub; - lptr = lptr0; - luptr = luptr0; - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = zero; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#elif defined (USE_VENDOR_BLAS) - zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt, 1, 1); -#else - zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 8 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0, it = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - z_sub(&ucol[rel], &ucol[rel], &tempv[it]); - ++it; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (it = 0, i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - z_sub(&nzval[indirect[rel]], - &nzval[indirect[rel]], - &tempv[it]); - ++it; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* if jb == k+1 */ - } /* if L(:,k) and U(k,:) not empty */ - - - if ( k+1 < nsupers ) { - kcol = PCOL( k+1, grid ); - if ( mycol == kcol ) { - /* Factor diagonal and subdiagonal blocks and test for exact - singularity. */ - pzgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); - - /* Process column *kcol+1* multicasts numeric values of L(:,k+1) - to process rows. */ - lk = LBj( k+1, grid ); /* Local block number. */ - lsub1 = Lrowind_bc_ptr[lk]; - if ( lsub1 ) { - msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub1[1] * SuperSize( k+1 ); - } else { - msgcnt[0] = 0; - msgcnt[1] = 0; - } - scp = &grid->rscp; /* The scope of process row. */ - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[lk][pj] != EMPTY ) { - lusup1 = Lnzval_bc_ptr[lk]; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, - (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); - MPI_Isend( lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, - (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*zword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, k+1, msgcnt[0], msgcnt[1], pj); -#endif - } - } /* for pj ... */ - } else { /* Post Recv of block column L(:,k+1). */ - if ( ToRecv[k+1] >= 1 ) { - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, - (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); - MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, - (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); -#endif - } - } /* if mycol == Pc(k+1) */ - } /* if k+1 < nsupers */ - - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - /* --------------------------------------------------- - Update all other blocks using block row U(k,:) - --------------------------------------------------- */ - for (j = 0; j < nub; ++j) { - lptr = lptr0; - luptr = luptr0; - jb = usub[iukp]; /* Global block number of block U(k,j). */ - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = zero; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#elif defined (USE_VENDOR_BLAS) - zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt, 1, 1); -#else - zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 8 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - /* Skip descriptor. Now point to fstnz index of - block U(i,j). */ - iuip[lib] += UB_DESCRIPTOR; - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0 ; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - z_sub(&ucol[rel], &ucol[rel], &tempv[i]); - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted for the L blocks. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - z_sub(&nzval[indirect[rel]], - &nzval[indirect[rel]], - &tempv[i]); - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* for j ... */ - } /* if k L(:,k) and U(k,:) are not empty */ - - } - /* ------------------------------------------ - END MAIN LOOP: for k = ... - ------------------------------------------ */ - - - if ( Pr*Pc > 1 ) { - SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ - SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ - if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); - if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); - SUPERLU_FREE(send_req); - } - - SUPERLU_FREE(Llu->ujrow); - SUPERLU_FREE(tempv2d); - SUPERLU_FREE(indirect); - SUPERLU_FREE(iuip); - SUPERLU_FREE(ruip); - - /* Prepare error message. */ - if ( *info == 0 ) *info = n + 1; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - { - float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; - - MPI_Reduce( &msg_cnt, &msg_cnt_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_cnt, &msg_cnt_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - if ( !iam ) { - printf("\tPZGSTRF comm stat:" - "\tAvg\tMax\t\tAvg\tMax\n" - "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", - msg_cnt_sum/Pr/Pc, msg_cnt_max, - msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); - } - } -#endif - if ( iinfo == n + 1 ) *info = 0; - else *info = iinfo; - - -#if ( PRNTlevel==3 ) - MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); - MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # total msg\t%d\n", iinfo); -#endif - -#if ( DEBUGlevel>=2 ) - for (i = 0; i < Pr * Pc; ++i) { - if ( iam == i ) { - zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); - zPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); - printf("(%d)\n", iam); - PrintInt10("Recv", nsupers, Llu->ToRecv); - } - MPI_Barrier( grid->comm ); - } -#endif - -#if ( DEBUGlevel>=3 ) - printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgstrf()"); -#endif -} /* PZGSTRF */ - - -/************************************************************************/ -static void pzgstrf2 -/************************************************************************/ -( - superlu_options_t *options, - int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, int* info - ) -/* - * Purpose - * ======= - * Factor diagonal and subdiagonal blocks and test for exact singularity. - * Only the process column that owns block column *k* participates - * in the work. - * - * Arguments - * ========= - * - * k (input) int (global) - * The column number of the block column to be factorized. - * - * thresh (input) double (global) - * The threshold value = s_eps * anorm. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization. - * See SuperLUStat_t structure defined in util.h. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * - */ -{ - int c, iam, l, pkk; - int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int luptr; - int_t i, krow, j, jfst, jlst; - int_t nsupc; /* number of columns in the block */ - int_t *xsup = Glu_persist->xsup; - doublecomplex *lusup, temp; - doublecomplex *ujrow; - doublecomplex one = {1.0, 0.0}, alpha = {-1.0, 0.0}; - *info = 0; - - /* Quick return. */ - - /* Initialization. */ - iam = grid->iam; - krow = PROW( k, grid ); - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - j = LBj( k, grid ); /* Local block number */ - jfst = FstBlockC( k ); - jlst = FstBlockC( k+1 ); - lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize( k ); - if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; - ujrow = Llu->ujrow; - - luptr = 0; /* Point to the diagonal entries. */ - c = nsupc; - for (j = 0; j < jlst - jfst; ++j) { - /* Broadcast the j-th row (nsupc - j) elements to - the process column. */ - if ( iam == pkk ) { /* Diagonal process. */ - i = luptr; - if ( options->ReplaceTinyPivot == YES ) { - if ( z_abs1(&lusup[i]) < thresh ) { /* Diagonal */ -#if ( PRNTlevel>=2 ) - printf("(%d) .. col %d, tiny pivot %e ", - iam, jfst+j, lusup[i]); -#endif - /* Keep the replaced diagonal with the same sign. */ - if ( lusup[i].r < 0 ) lusup[i].r = -thresh; - else lusup[i].r = thresh; - lusup[i].i = 0.0; -#if ( PRNTlevel>=2 ) - printf("replaced by %e\n", lusup[i]); -#endif - ++(stat->TinyPivots); - } - } - for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; - } -#if 0 - dbcast_col(ujrow, c, pkk, UjROW, grid, &c); -#else - MPI_Bcast(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (grid->cscp).comm); - /*bcast_tree(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (24*k+j)%NTAGS, - grid, COMM_COLUMN, &c);*/ -#endif - -#if ( DEBUGlevel>=2 ) -if ( k == 3329 && j == 2 ) { - if ( iam == pkk ) { - printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); - } else { - printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); - } -} -#endif - - if ( !lusup ) { /* Empty block column. */ - --c; - if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) *info = j+jfst+1; - continue; - } - - /* Test for singularity. */ - if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) { - *info = j+jfst+1; - } else { - /* Scale the j-th column of the matrix. */ - z_div(&temp, &one, &ujrow[0]); - if ( iam == pkk ) { - for (i = luptr+1; i < luptr-j+nsupr; ++i) - zz_mult(&lusup[i], &lusup[i], &temp); - stat->ops[FACT] += 6*(nsupr-j-1) + 10; - } else { - for (i = luptr; i < luptr+nsupr; ++i) - zz_mult(&lusup[i], &lusup[i], &temp); - stat->ops[FACT] += 6*nsupr + 10; - } - } - - /* Rank-1 update of the trailing submatrix. */ - if ( --c ) { - if ( iam == pkk ) { - l = nsupr - j - 1; -#ifdef _CRAY - CGERU(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#else - zgeru_(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#endif - stat->ops[FACT] += 8 * l * c; - } else { -#ifdef _CRAY - CGERU(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#else - zgeru_(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#endif - stat->ops[FACT] += 8 * nsupr * c; - } - } - - /* Move to the next column. */ - if ( iam == pkk ) luptr += nsupr + 1; - else luptr += nsupr; - - } /* for j ... */ - -} /* PZGSTRF2 */ - - -/************************************************************************/ -static void pzgstrs2 -/************************************************************************/ -#ifdef _CRAY -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 - ) -#else -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat - ) -#endif -/* - * Purpose - * ======= - * Perform parallel triangular solves - * U(k,:) := A(k,:) \ L(k,k). - * Only the process row that owns block row *k* participates - * in the work. - * - * Arguments - * ========= - * - * m (input) int (global) - * Number of rows in the matrix. - * - * k (input) int (global) - * The row number of the block row to be factorized. - * - * Glu_persist (input) Glu_persist_t* - * Global data structures (xsup, supno) replicated on all processes. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Llu (input/output) LocalLU_t* - * Local data structures to store distributed L and U matrices. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the factorization; - * See SuperLUStat_t structure defined in util.h. - * - */ -{ - int iam, pkk; - int incx = 1; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ - int segsize; - int_t nsupc; /* number of columns in the block */ - int_t luptr, iukp, rukp; - int_t b, gb, j, klst, knsupc, lk, nb; - int_t *xsup = Glu_persist->xsup; - int_t *usub; - doublecomplex *lusup, *uval; - - /* Quick return. */ - lk = LBi( k, grid ); /* Local block number */ - if ( !Llu->Unzval_br_ptr[lk] ) return; - - /* Initialization. */ - iam = grid->iam; - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - klst = FstBlockC( k+1 ); - knsupc = SuperSize( k ); - usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ - uval = Llu->Unzval_br_ptr[lk]; - nb = usub[0]; - iukp = BR_HEADER; - rukp = 0; - if ( iam == pkk ) { - lk = LBj( k, grid ); - nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ - lusup = Llu->Lnzval_bc_ptr[lk]; - } else { - nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ - lusup = Llu->Lval_buf_2[k%2]; - } - - /* Loop through all the row blocks. */ - for (b = 0; b < nb; ++b) { - gb = usub[iukp]; - nsupc = SuperSize( gb ); - iukp += UB_DESCRIPTOR; - - /* Loop through all the segments in the block. */ - for (j = 0; j < nsupc; ++j) { - segsize = klst - usub[iukp++]; - if ( segsize ) { /* Nonzero segment. */ - luptr = (knsupc - segsize) * (nsupr + 1); -#ifdef _CRAY - CTRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#elif defined (USE_VENDOR_BLAS) - ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx, 1, 1, 1); -#else - ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#endif - stat->ops[FACT] += 4 * segsize * (segsize + 1) - + 10 * segsize; /* complex division */ - rukp += segsize; - } - } - } /* for b ... */ - -} /* PZGSTRS2 */ - -static int -probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm, - int buf_size) -{ - MPI_Status status; - int count; - - MPI_Probe( source, tag, comm, &status ); - MPI_Get_count( &status, datatype, &count ); - if ( count > buf_size ) { - printf("(%d) Recv'ed count %d > buffer size $d\n", - iam, count, buf_size); - exit(-1); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,822 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_zdefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*); -fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, - int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - - -void pzgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - doublecomplex *x, int nrhs, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PZGSTRS1 solves a system of distributed linear equations - * - * op( sub(A) ) * X = sub( B ) - * - * with a general N-by-N distributed matrix sub( A ) using the LU - * factorization computed by PZGSTRF. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures to store L and U factors, - * and the permutation vectors. - * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * x (input/output) doublecomplex* - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * NOTE: the right-hand side matrix is already distributed on - * the diagonal processes. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves; - * See SuperLUStat_t structure defined in util.h. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - doublecomplex alpha = {1.0, 0.0}; - doublecomplex zero = {0.0, 0.0}; - doublecomplex *lsum; /* Local running sum of the updates to B-components */ - doublecomplex *lusup, *dest; - doublecomplex *recvbuf, *tempv; - doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - doublecomplex **Lnzval_bc_ptr; - MPI_Status status; -#ifdef ISEND_IRECV - MPI_Request *send_req, recv_req; -#endif - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for L-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -8; - if ( *info ) { - pxerbla("PZGSTRS1", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - Llu->SolveMsgSent = 0; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgstrs1()"); -#endif - - /* Save the count to be altered so it can be used by - subsequent call to PZGSTRS1. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#ifdef ISEND_IRECV - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Compute ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs - + nlb * LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); - if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - - /* - * Prepended the block number in the header for lsum[]. - */ - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H].r = k; - lsum[il - LSUM_H].i = 0; - } - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( !frecv[lk] && !fmod[lk] ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, - pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* - * Compute the internal nodes asynchronously by all processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = (*recvbuf).r; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel>=2 ) - if ( !iam ) printf("\n.. After L-solve: y =\n"); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - } - MPI_Barrier( grid->comm ); - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PZGSTRS1. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; - } - } - - /* Set up additional pointers for the index and value arrays of U. - nlb is the number of local block rows. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( !brecv[lk] && !bmod[lk] ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - k = (*recvbuf).r; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - - if ( !(--brecv[lk]) && !bmod[lk] ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - - /* Deallocate storage. */ - - SUPERLU_FREE(lsum); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgstrs1()"); -#endif - -} /* PZGSTRS1 */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,965 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_zdefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*); -fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, - int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif -static void gather_diag_to_all(int_t, int_t, doublecomplex [], Glu_persist_t *, - LocalLU_t *, gridinfo_t *, int_t, int_t [], - int_t [], doublecomplex [], int_t, doublecomplex []); - - -void -pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - doublecomplex *B, int_t ldb, int nrhs, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * pzgstrs_Bglobal solves a system of distributed linear equations - * A*X = B with a general N-by-N matrix A using the LU factorization - * computed by pzgstrf. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from pzgstrf for - * the possibly scaled and permuted matrix A. - * See superlu_ddefs.h for the definition of 'LUstruct_t'. - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_ddefs.h for the definition of 'gridinfo_t'. - * - * B (input/output) doublecomplex* - * On entry, the right-hand side matrix of the possibly equilibrated - * and row permuted system. - * On exit, the solution matrix of the possibly equilibrated - * and row permuted system if info = 0; - * - * NOTE: Currently, the N-by-NRHS matrix B must reside on all - * processes when calling this routine. - * - * ldb (input) int (global) - * Leading dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - doublecomplex alpha = {1.0, 0.0}; - doublecomplex zero = {0.0, 0.0}; - doublecomplex *lsum; /* Local running sum of the updates to B-components */ - doublecomplex *x; /* X component at step k. */ - doublecomplex *lusup, *dest; - doublecomplex *recvbuf, *tempv; - doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - doublecomplex **Lnzval_bc_ptr; - MPI_Status status; -#if defined (ISEND_IRECV) || defined (BSEND) - MPI_Request *send_req, recv_req; -#endif - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for L-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerbla("PDGSTRS_BGLOBAL", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()"); -#endif - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#if defined (ISEND_IRECV) || defined (BSEND) - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs - + nlb * LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs - + nlb * XK_H)) ) - ABORT("Malloc fails for x[]."); - if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - - /* - * Copy B into X on the diagonal processes. - */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */ - lsum[il - LSUM_H].i = 0; - kcol = PCOL( k, grid ); - if ( mycol == kcol ) { /* Diagonal process. */ - jj = X_BLK( lk ); - x[jj - XK_H].r = k; /* Block number prepended in the header. */ - x[jj - XK_H].i = 0; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ - x[i + jj + j*knsupc] = B[i + ii + j*ldb]; - } - } - ii += knsupc; - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( frecv[lk]==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, - pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req,stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#if 1 - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); -#else - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &request ); - MPI_Wait( &request, &status ); -#endif - - k = (*recvbuf).r; - - - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], - &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel>=2 ) - printf("\n(%d) .. After L-solve: y =\n", iam); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - } - MPI_Barrier( grid->comm ); - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS_BGLOBAL. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; - } - } - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, - MPI_ANY_TAG, grid->comm, &status ); - - k = (*recvbuf).r; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], - &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - - - /* Copy the solution X into B (on all processes). */ - { - int_t num_diag_procs, *diag_procs, *diag_len; - doublecomplex *work; - - get_diag_procs(n, Glu_persist, grid, &num_diag_procs, - &diag_procs, &diag_len); - jj = diag_len[0]; - for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); - if ( !(work = doublecomplexMalloc_dist(((size_t)jj)*nrhs)) ) - ABORT("Malloc fails for work[]"); - gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, - grid, num_diag_procs, diag_procs, diag_len, - B, ldb, work); - SUPERLU_FREE(diag_procs); - SUPERLU_FREE(diag_len); - SUPERLU_FREE(work); - } - - /* Deallocate storage. */ - - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif -#ifdef BSEND - SUPERLU_FREE(send_req); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgstrs_Bglobal()"); -#endif - -} /* PZGSTRS_BGLOBAL */ - - -/* - * Gather the components of x vector on the diagonal processes - * onto all processes, and combine them into the global vector y. - */ -static void -gather_diag_to_all(int_t n, int_t nrhs, doublecomplex x[], - Glu_persist_t *Glu_persist, LocalLU_t *Llu, - gridinfo_t *grid, int_t num_diag_procs, - int_t diag_procs[], int_t diag_len[], - doublecomplex y[], int_t ldy, doublecomplex work[]) -{ - int_t i, ii, j, k, lk, lwork, nsupers, p; - int_t *ilsum, *xsup; - int iam, knsupc, pkk; - doublecomplex *x_col, *y_col; - - iam = grid->iam; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - ilsum = Llu->ilsum; - - for (p = 0; p < num_diag_procs; ++p) { - pkk = diag_procs[p]; - if ( iam == pkk ) { - /* Copy x vector into a buffer. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ - x_col = &x[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; - lwork += knsupc; - x_col += knsupc; - } - } - MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } else { - MPI_Bcast( work, diag_len[p]*nrhs, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); - } - /* Scatter work[] into global y vector. */ - lwork = 0; - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - ii = FstBlockC( k ); - y_col = &y[ii]; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; - lwork += knsupc; - y_col += ldy; - } - } - } -} /* GATHER_DIAG_TO_ALL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1201 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include "superlu_zdefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - - -int_t -pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, - int_t fst_row, int_t *ilsum, doublecomplex *x, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, - gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) -{ -/* - * Purpose - * ======= - * Re-distribute B on the diagonal processes of the 2D process mesh. - * - * Note - * ==== - * This routine can only be called after the routine pxgstrs_init(), - * in which the structures of the send and receive buffers are set up. - * - * Arguments - * ========= - * - * B (input) doublecomplex* - * The distributed right-hand side matrix of the possibly - * equilibrated system. - * - * m_loc (input) int (local) - * The local row dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * ldb (input) int (local) - * Leading dimension of matrix B. - * - * fst_row (input) int (global) - * The row number of B's first row in the global matrix. - * - * ilsum (input) int* (global) - * Starting position of each supernode in a full array. - * - * x (output) doublecomplex* - * The solution vector. It is valid only on the diagonal processes. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * SOLVEstruct (input) SOLVEstruct_t* - * Contains the information for the communication during the - * solution phase. - * - * Return value - * ============ - * - */ - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *perm_r, *perm_c; /* row and column permutation vectors */ - int_t *send_ibuf, *recv_ibuf; - doublecomplex *send_dbuf, *recv_dbuf; - int_t *xsup, *supno; - int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; - int p, procs; - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - SendCnt = gstrs_comm->B_to_X_SendCnt; - SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; - RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; - sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; - rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - /* ------------------------------------------------------------ - NOW COMMUNICATE THE ACTUAL DATA. - ------------------------------------------------------------*/ - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls[p] * nrhs; - } - - /* Copy the row indices and values to the send buffer. */ - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - k = ptr_to_ibuf[p]; - send_ibuf[k] = irow; - k = ptr_to_dbuf[p]; - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - send_dbuf[k++] = B[i + j*ldb]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - - /* Communicate the (permuted) row indices. */ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - - /* Communicate the numerical values. */ - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - grid->comm); - - /* ------------------------------------------------------------ - Copy buffer into X on the diagonal processes. - ------------------------------------------------------------*/ - ii = 0; - for (p = 0; p < procs; ++p) { - jj = rdispls_nrhs[p]; - for (i = 0; i < RecvCnt[p]; ++i) { - /* Only the diagonal processes do this; the off-diagonal processes - have 0 RecvCnt. */ - irow = recv_ibuf[ii]; /* The permuted row index. */ - k = BlockNum( irow ); - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number. */ - l = X_BLK( lk ); - x[l - XK_H].r = k; /* Block number prepended in the header. */ - x[l - XK_H].i = 0; - irow = irow - FstBlockC(k); /* Relative row number in X-block */ - RHS_ITERATE(j) { - x[l + irow + j*knsupc] = recv_dbuf[jj++]; - } - ++ii; - } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); -#endif - return 0; -} /* pzReDistribute_B_to_X */ - -int_t -pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t fst_row, - int_t nrhs, doublecomplex *x, int_t *ilsum, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) -{ -/* - * Purpose - * ======= - * Re-distribute X on the diagonal processes to B distributed on all - * the processes. - * - * Note - * ==== - * This routine can only be called after the routine pxgstrs_init(), - * in which the structures of the send and receive buffers are set up. - * - */ - int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; - int_t *xsup, *supno; - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - doublecomplex *send_dbuf, *recv_dbuf; - int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - int iam, p, q, pkk, procs; - int_t num_diag_procs, *diag_procs; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()"); -#endif - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; - iam = grid->iam; - procs = grid->nprow * grid->npcol; - - SendCnt = gstrs_comm->X_to_B_SendCnt; - SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; - RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; - sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; - rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls_nrhs[p]; - } - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number */ - irow = FstBlockC( k ); - l = X_BLK( lk ); - for (i = 0; i < knsupc; ++i) { -#if 0 - ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ -#else - ii = irow; -#endif - q = row_to_proc[ii]; - jj = ptr_to_ibuf[q]; - send_ibuf[jj] = ii; - jj = ptr_to_dbuf[q]; - RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ - send_dbuf[jj++] = x[l + i + j*knsupc]; - } - ++ptr_to_ibuf[q]; - ptr_to_dbuf[q] += nrhs; - ++irow; - } - } - } - } - - /* ------------------------------------------------------------ - COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. - ------------------------------------------------------------*/ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - grid->comm); - - /* ------------------------------------------------------------ - COPY THE BUFFER INTO B. - ------------------------------------------------------------*/ - for (i = 0, k = 0; i < m_loc; ++i) { - irow = recv_ibuf[i]; - irow -= fst_row; /* Relative row number */ - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - B[irow + j*ldb] = recv_dbuf[k++]; - } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()"); -#endif - return 0; - -} /* pzReDistribute_X_to_B */ - - -void -pzgstrs(int_t n, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, - gridinfo_t *grid, doublecomplex *B, - int_t m_loc, int_t fst_row, int_t ldb, int nrhs, - SOLVEstruct_t *SOLVEstruct, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * PZGSTRS solves a system of distributed linear equations - * A*X = B with a general N-by-N matrix A using the LU factorization - * computed by PZGSTRF. - * If the equilibration, and row and column permutations were performed, - * the LU factorization was performed for A1 where - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * and the linear system solved is - * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and - * the permutation to B1 by Pc*Pr is applied internally in this routine. - * - * Arguments - * ========= - * - * n (input) int (global) - * The order of the system of linear equations. - * - * LUstruct (input) LUstruct_t* - * The distributed data structures storing L and U factors. - * The L and U factors are obtained from PZGSTRF for - * the possibly scaled and permuted matrix A. - * See superlu_zdefs.h for the definition of 'LUstruct_t'. - * A may be scaled and permuted into A1, so that - * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U - * - * grid (input) gridinfo_t* - * The 2D process mesh. It contains the MPI communicator, the number - * of process rows (NPROW), the number of process columns (NPCOL), - * and my process rank. It is an input argument to all the - * parallel routines. - * Grid can be initialized by subroutine SUPERLU_GRIDINIT. - * See superlu_defs.h for the definition of 'gridinfo_t'. - * - * B (input/output) doublecomplex* - * On entry, the distributed right-hand side matrix of the possibly - * equilibrated system. That is, B may be overwritten by diag(R)*B. - * On exit, the distributed solution matrix Y of the possibly - * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, - * and X is the solution of the original system. - * - * m_loc (input) int (local) - * The local row dimension of matrix B. - * - * fst_row (input) int (global) - * The row number of B's first row in the global matrix. - * - * ldb (input) int (local) - * The leading dimension of matrix B. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * SOLVEstruct (output) SOLVEstruct_t* (global) - * Contains the information for the communication during the - * solution phase. - * - * stat (output) SuperLUStat_t* - * Record the statistics about the triangular solves. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - doublecomplex alpha = {1.0, 0.0}; - doublecomplex zero = {0.0, 0.0}; - doublecomplex *lsum; /* Local running sum of the updates to B-components */ - doublecomplex *x; /* X component at step k. */ - /* NOTE: x and lsum are of same size. */ - doublecomplex *lusup, *dest; - doublecomplex *recvbuf, *tempv; - doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t iam, kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb, nub, nsupers; - int_t *xsup, *supno, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int_t Pc, Pr; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - doublecomplex **Lnzval_bc_ptr; - MPI_Status status; -#ifdef ISEND_IRECV - MPI_Request *send_req, recv_req; -#endif - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve -- - Count the number of local block products to - be summed into lsum[lk]. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t *frecv; /* Count of lsum[lk] contributions to be received - from processes in this row. - It is only valid on the diagonal processes. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ - double t; -#if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; -#endif - - t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerbla("PZGSTRS", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pzgstrs()"); -#endif - - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - -#ifdef ISEND_IRECV - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); -#endif - -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); -#endif - - - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) - ABORT("Malloc fails for x[]."); - if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for rtemp[]."); - - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - /* Redistribute B into X on the diagonal processes. */ - pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); - - /* Set up the headers in lsum[]. */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ - lsum[il - LSUM_H].i = 0; - } - ii += knsupc; - } - - /* - * Compute frecv[] and nfrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && fmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nfrecvmod += frecv[lk]; - if ( !frecv[lk] && !fmod[lk] ) ++nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); - assert( frecv[lk] < Pc ); -#endif - } - } - } - } - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); -#endif - for (k = 0; k < nsupers && nleaf; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - if ( frecv[lk]==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ - --nleaf; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } - } /* if diagonal process ... */ - } /* for k ... */ - - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", - iam, nfrecvx, nfrecvmod, nleaf); -#endif - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - /* Receive a message. */ -#ifdef ISEND_IRECV - /* -MPI- FATAL: Remote protocol queue full */ - MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); - MPI_Wait( &recv_req, &status ); -#else - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); -#endif - - k = (*recvbuf).r; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nfrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nfrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], - &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - } - - if ( (--frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - } /* switch */ - - } /* while not finished ... */ - - -#if ( PRNTlevel>=2 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - -#if ( DEBUGlevel==2 ) - { - printf("(%d) .. After L-solve: y =\n", iam); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - } -#endif - - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); - SUPERLU_FREE(rtemp); - -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - Llu->SolveMsgSent = 0; -#endif - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PZGSTRS. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; - - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif - } - } - } - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; - } - } - } - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); - } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); - } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ - - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); - t = SuperLU_timer_(); -#endif - - /* - * Solve the roots first by all the diagonal processes. - */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); -#endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, - grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - k = (*recvbuf).r; - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); -#endif - - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - - break; - - case LSUM: /* Receiver must be a diagonal process */ - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - z_add(&x[i + ii + j*knsupc], - &x[i + ii + j*knsupc], - &tempv[i + j*knsupc]); - } - - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &knsupc); -#endif - stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, - grid->comm ); -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif - } - } - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - if ( Urbs[lk] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if becomes solvable */ - - break; - -#if ( DEBUGlevel>=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); - break; -#endif - - } /* switch */ - - } /* while not finished ... */ - -#if ( PRNTlevel>=3 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); -#endif - -#if ( DEBUGlevel>=2 ) - { - doublecomplex *x_col; - int diag; - printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - diag = PNUM( krow, kcol, grid); - if ( iam == diag ) { /* Diagonal process. */ - lk = LBi( k, grid ); - jj = X_BLK( lk ); - x_col = &x[jj]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ - printf("\t(%d)\t%4d\t%.10f\n", - iam, xsup[k]+i, x_col[i]); - } - x_col += knsupc; - } - } - ii += knsupc; - } /* for k ... */ - } -#endif - - pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); - - - /* Deallocate storage. */ - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) { - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); -#ifdef ISEND_IRECV - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); - SUPERLU_FREE(send_req); -#endif - - stat->utime[SOLVE] = SuperLU_timer_() - t; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pzgstrs()"); -#endif - -} /* PZGSTRS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,368 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * Modified: - * Feburary 7, 2001 use MPI_Isend/MPI_Irecv - * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test - */ - -#include "superlu_zdefs.h" - -#define ISEND_IRECV - -/* - * Function prototypes - */ -#ifdef _CRAY -fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*); -fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, - int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*); -_fcd ftcs1; -_fcd ftcs2; -_fcd ftcs3; -#endif - -/************************************************************************/ -void zlsum_fmod -/************************************************************************/ -( - doublecomplex *lsum, /* Sum of local modifications. */ - doublecomplex *x, /* X array (local) */ - doublecomplex *xk, /* X[k]. */ - doublecomplex *rtemp, /* Result of full matrix-vector multiply. */ - int nrhs, /* Number of right-hand sides. */ - int knsupc, /* Size of supernode k. */ - int_t k, /* The k-th component of X. */ - int_t *fmod, /* Modification count for L-solve. */ - int_t nlb, /* Number of L blocks. */ - int_t lptr, /* Starting position in lsub[*]. */ - int_t luptr, /* Starting position in lusup[*]. */ - int_t *xsup, - gridinfo_t *grid, - LocalLU_t *Llu, - MPI_Request send_req[], - SuperLUStat_t *stat -) -{ -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= L_i,k * X[k]. - */ - doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; - doublecomplex *lusup, *lusup1; - doublecomplex *dest; - int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; - int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel; - int_t *lsub, *lsub1, nlb1, lptr1, luptr1; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; - MPI_Status status; - int test_flag; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Llu->Lrowind_bc_ptr[lk]; - lusup = Llu->Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; - - for (lb = 0; lb < nlb; ++lb) { - ik = lsub[lptr]; /* Global block number, row-wise. */ - nbrow = lsub[lptr+1]; -#ifdef _CRAY - CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); -#elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow, 1, 1 ); -#else - zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); -#endif - stat->ops[SOLVE] += 8 * nbrow * nrhs * knsupc + 2 * nbrow * nrhs; - - lk = LBi( ik, grid ); /* Local block number, row-wise. */ - iknsupc = SuperSize( ik ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - lptr += LB_DESCRIPTOR; - rel = xsup[ik]; /* Global row index of block ik. */ - for (i = 0; i < nbrow; ++i) { - irow = lsub[lptr++] - rel; /* Relative row. */ - RHS_ITERATE(j) - z_sub(&dest[irow + j*iknsupc], - &dest[irow + j*iknsupc], - &rtemp[i + j*nbrow]); - } - luptr += nbrow; - - if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ - ikcol = PCOL( ik, grid ); - p = PNUM( myrow, ikcol, grid ); - if ( iam != p ) { -#ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm ); -#else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -#endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( lk ); - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - z_add(&x[i + ii + j*iknsupc], - &x[i + ii + j*iknsupc], - &lsum[i + il + j*iknsupc]); - if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( ik, grid );/* Local block number, column-wise. */ - lsub1 = Llu->Lrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); -#endif - stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, ikcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - nlb1 = lsub1[0] - 1; - lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - fmod, nlb1, lptr1, luptr1, xsup, - grid, Llu, send_req, stat); - } /* if frecv[lk] == 0 */ - } /* if iam == p */ - } /* if fmod[lk] == 0 */ - - } /* for lb ... */ - -} /* zLSUM_FMOD */ - - -/************************************************************************/ -void zlsum_bmod -/************************************************************************/ -( - doublecomplex *lsum, /* Sum of local modifications. */ - doublecomplex *x, /* X array (local). */ - doublecomplex *xk, /* X[k]. */ - int nrhs, /* Number of right-hand sides. */ - int_t k, /* The k-th component of X. */ - int_t *bmod, /* Modification count for L-solve. */ - int_t *Urbs, /* Number of row blocks in each block column of U.*/ - Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ - int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ - int_t *xsup, - gridinfo_t *grid, - LocalLU_t *Llu, - MPI_Request send_req[], - SuperLUStat_t *stat - ) -{ -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= U_i,k * X[k]. - */ - doublecomplex alpha = {1.0, 0.0}; - int iam, iknsupc, knsupc, myrow, nsupr, p, pi; - int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, - j, jj, lk, lk1, nub, ub, uptr; - int_t *usub; - doublecomplex *uval, *dest, *y; - doublecomplex temp; - int_t *lsub; - doublecomplex *lusup; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *brecv = Llu->brecv; - int_t **bsendx_plist = Llu->bsendx_plist; - MPI_Status status; - int test_flag; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - knsupc = SuperSize( k ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - for (ub = 0; ub < nub; ++ub) { - ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ - usub = Llu->Ufstnz_br_ptr[ik]; - uval = Llu->Unzval_br_ptr[ik]; - i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ - i += UB_DESCRIPTOR; - il = LSUM_BLK( ik ); - gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ - iknsupc = SuperSize( gik ); - ikfrow = FstBlockC( gik ); - iklrow = FstBlockC( gik+1 ); - - RHS_ITERATE(j) { - dest = &lsum[il + j*iknsupc]; - y = &xk[j*knsupc]; - uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ - for (jj = 0; jj < knsupc; ++jj) { - fnz = usub[i + jj]; - if ( fnz < iklrow ) { /* Nonzero segment. */ - /* AXPY */ - for (irow = fnz; irow < iklrow; ++irow) { - zz_mult(&temp, &uval[uptr], &y[jj]); - z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow], - &temp); - ++uptr; - } - stat->ops[SOLVE] += 8 * (iklrow - fnz); - } - } /* for jj ... */ - } - - if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ - gikcol = PCOL( gik, grid ); - p = PNUM( myrow, gikcol, grid ); - if ( iam != p ) { -#ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm ); -#else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -#endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( ik ); - dest = &x[ii]; - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - z_add(&dest[i + j*iknsupc], &dest[i + j*iknsupc], - &lsum[i + il + j*iknsupc]); - if ( !brecv[ik] ) { /* Becomes a leaf node. */ - bmod[ik] = -1; /* Do not solve X[k] in the future. */ - lk1 = LBj( gik, grid ); /* Local block number. */ - lsub = Llu->Lrowind_bc_ptr[lk1]; - lusup = Llu->Lnzval_bc_ptr[lk1]; - nsupr = lsub[1]; -#ifdef _CRAY - CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); -#elif defined (USE_VENDOR_BLAS) - ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); -#else - ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); -#endif - stat->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs - + 10 * iknsupc * nrhs; /* complex division */ -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, gik); -#endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( bsendx_plist[lk1][p] != EMPTY ) { - pi = PNUM( p, gikcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } - /* - * Perform local block modifications. - */ - if ( Urbs[lk1] ) - zlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if brecv[ik] == 0 */ - } - } /* if bmod[ik] == 0 */ - - } /* for ub ... */ - -} /* zlSUM_BMOD */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlangs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,127 +0,0 @@ - -/* - * File name: pzlangs.c - * History: Modified from lapack routine ZLANGE - */ -#include -#include "superlu_zdefs.h" - -double pzlangs(char *norm, SuperMatrix *A, gridinfo_t *grid) -{ -/* - Purpose - ======= - - PZLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - PZLANGE returns the value - - PZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in DLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - GRID (input) gridinof_t* - The 2D process mesh. - ===================================================================== -*/ - - /* Local variables */ - NRformat_loc *Astore; - int_t m_loc; - doublecomplex *Aval; - int_t i, j, irow, jcol; - double value=0., sum; - double *rwork; - double tempvalue; - double *temprwork; - - Astore = (NRformat_loc *) A->Store; - m_loc = Astore->m_loc; - Aval = (doublecomplex *) Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - value = SUPERLU_MAX( value, z_abs(&Aval[j]) ); - } - - MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - value = tempvalue; - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; -#if 0 - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += fabs(Aval[i]); - value = SUPERLU_MAX(value,sum); - } -#else /* XSL ==> */ - if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) ) - ABORT("doubleCalloc_dist fails for rwork."); - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - rwork[jcol] += z_abs(&Aval[j]); - } - } - - if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) ) - ABORT("doubleCalloc_dist fails for temprwork."); - MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm); - value = 0.; - for (j = 0; j < A->ncol; ++j) { - value = SUPERLU_MAX(value, temprwork[j]); - } - SUPERLU_FREE (temprwork); - SUPERLU_FREE (rwork); -#endif - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - value = 0.; - sum = 0.; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - sum += z_abs(&Aval[j]); - value = SUPERLU_MAX(value, sum); - } - MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - value = tempvalue; - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else { - ABORT("Illegal norm specified."); - } - - return (value); - -} /* pzlangs */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,135 +0,0 @@ - -/* - * File name: pzlaqgs.c - * History: Modified from LAPACK routine ZLAQGE - */ -#include -#include "superlu_zdefs.h" - -void -pzlaqgs(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - PZLAQGS equilibrates a general sparse M by N matrix A using the row - and column scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NRformat_loc *Astore; - doublecomplex *Aval; - int_t i, j, irow, jcol, m_loc; - double large, small, cj; - extern double dlamch_(char *); - double temp; - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - m_loc = Astore->m_loc; - - /* Initialize LARGE and SMALL. */ - small = dlamch_("Safe minimum") / dlamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - zd_mult(&Aval[j], &Aval[j], c[jcol]); - } - ++irow; - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - zd_mult(&Aval[j], &Aval[j], r[irow]); - ++irow; - } - *(unsigned char *)equed = 'R'; - } else { - /* Both row and column scaling */ - irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - temp = r[irow] * c[jcol]; - zd_mult(&Aval[j], &Aval[j], temp); - } - ++irow; - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* pzlaqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1942 +0,0 @@ - -/* - * -- Parallel symbolic factorization auxialiary routine (version 2.2) -- - * -- Distributes the data from parallel symbolic factorization - * -- to numeric factorization - * INRIA France - July 1, 2004 - * Laura Grigori - * - * November 1, 2007 - * Feburary 20, 2008 - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -#include "superlu_zdefs.h" -#include "psymbfact.h" - -static float -dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, - Glu_persist_t *Glu_persist, - int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, - gridinfo_t *grid - ) -/* - * Purpose - * ======= - * - * Redistribute the symbolic structure of L and U from the distribution - * used in the parallel symbolic factorization step to the distdibution - * used in the parallel numeric factorization step. On exit, the L and U - * structure for the 2D distribution used in the numeric factorization step is - * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal - * information is also computed and it is stored in Glu_persist->supno - * and Glu_persist->xsup. - * - * This routine allocates memory for storing the structure of L and U - * and the supernodes information. This represents the arrays: - * p_xlsub, p_lsub, p_xusub, p_usub, - * Glu_persist->supno, Glu_persist->xsup. - * - * This routine also deallocates memory allocated during symbolic - * factorization routine. That is, the folloing arrays are freed: - * Pslu_freeable->xlsub, Pslu_freeable->lsub, - * Pslu_freeable->xusub, Pslu_freeable->usub, - * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, - * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. - * - * Arguments - * ========= - * - * n (Input) int_t - * Order of the input matrix - * Pslu_freeable (Input) Pslu_freeable_t * - * Local L and U structure, - * global to local indexing information. - * - * Glu_persist (Output) Glu_persist_t * - * Stores on output the information on supernodes mapping. - * - * p_xlsub (Output) int_t ** - * Pointer to structure of L distributed on a 2D grid - * of processors, stored by columns. - * - * p_lsub (Output) int_t ** - * Structure of L distributed on a 2D grid of processors, - * stored by columns. - * - * p_xusub (Output) int_t ** - * Pointer to structure of U distributed on a 2D grid - * of processors, stored by rows. - * - * p_usub (Output) int_t ** - * Structure of U distributed on a 2D grid of processors, - * stored by rows. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU. - * > 0, number of bytes allocated in this routine when out of memory. - * (an approximation). - */ -{ - int iam, nprocs, pc, pr, p, np, p_diag; - int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, - *tmp_ptrToSend, *mem; - int_t *nnzToRecv_l, *nnzToRecv_u; - int_t *send_1, *send_2, nsend_1, nsend_2; - int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; - int_t nsupers, nsupers_i, nsupers_j; - int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; - int_t maxszsn, maxNvtcsPProc; - int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; - int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; - int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; - int_t *xsub_s, *sub_s, *xsub_n, *sub_n; - int_t *globToLoc, nvtcs_loc; - int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, - RecvCnt_l, RecvCnt_u, ind_loc; - int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; - int_t nelts, isize; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_symbLU()"); -#endif - nprocs = (int) grid->nprow * grid->npcol; - xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; - xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = Pslu_freeable->nvtcs_loc; - xsup_beg_s = Pslu_freeable->xsup_beg_loc; - xsup_end_s = Pslu_freeable->xsup_end_loc; - supno_s = Pslu_freeable->supno_loc; - rcv_luind = NULL; - iword = sizeof(int_t); - dword = sizeof(doublecomplex); - memAux = 0.; memRet = 0.; - - mem = intCalloc_dist(12 * nprocs); - if (!mem) - return (ERROR_RET); - memAux = (float) (12 * nprocs * sizeof(int_t)); - nnzToRecv = mem; - nnzToSend = nnzToRecv + 2*nprocs; - nnzToSend_l = nnzToSend + 2 * nprocs; - nnzToSend_u = nnzToSend_l + nprocs; - send_1 = nnzToSend_u + nprocs; - send_2 = send_1 + nprocs; - tmp_ptrToSend = send_2 + nprocs; - nnzToRecv_l = tmp_ptrToSend + nprocs; - nnzToRecv_u = nnzToRecv_l + nprocs; - - ptrToSend = nnzToSend; - ptrToRecv = nnzToSend + nprocs; - - nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); - intBuf1 = nvtcs + nprocs; - intBuf2 = nvtcs + 2 * nprocs; - intBuf3 = nvtcs + 3 * nprocs; - intBuf4 = nvtcs + 4 * nprocs; - memAux += 5 * nprocs * sizeof(int); - - maxszsn = sp_ienv_dist(3); - - /* Allocate space for storing Glu_persist_n. */ - if ( !(supno_n = intMalloc_dist(n+1)) ) { - fprintf (stderr, "Malloc fails for supno_n[]."); - return (memAux); - } - memRet += (float) ((n+1) * sizeof(int_t)); - - /* ------------------------------------------------------------ - DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION - ------------------------------------------------------------*/ - - if (nvtcs_loc > INT_MAX) - ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); - intNvtcs_loc = (int) nvtcs_loc; - MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, - 0, grid->comm); - - if (!iam) { - /* set ptrToRecv to point to the beginning of the data for - each processor */ - for (k = 0, p = 0; p < nprocs; p++) { - ptrToRecv[p] = k; - k += nvtcs[p]; - } - } - - if (nprocs > 1) { - temp = NULL; - if (!iam ) { - if ( !(temp = intMalloc_dist (n+1)) ) { - fprintf (stderr, "Malloc fails for temp[]."); - return (memAux + memRet); - } - memAux += (float) (n+1) * iword; - } -#if defined (_LONGINT) - for (p=0; p INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = ptrToRecv; -#endif - MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, - temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); - } - else - temp = supno_s; - - if (!iam) { - nsupers = 0; - p = (int) OWNER( globToLoc[0] ); - gb = temp[ptrToRecv[p]]; - supno_n[0] = nsupers; - ptrToRecv[p] ++; - szsn = 1; - for (j = 1; j < n; j ++) { - if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { - nsupers ++; - p = (int) OWNER( globToLoc[j] ); - gb = temp[ptrToRecv[p]]; - szsn = 1; - } - else { - szsn ++; - } - ptrToRecv[p] ++; - supno_n[j] = nsupers; - } - nsupers++; - if (nprocs > 1) { - SUPERLU_FREE (temp); - memAux -= (float) (n+1) * iword; - } - supno_n[n] = nsupers; - } - - /* reset to 0 nnzToSend */ - for (p = 0; p < 2 *nprocs; p++) - nnzToSend[p] = 0; - - MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); - nsupers = supno_n[n]; - /* Allocate space for storing Glu_persist_n. */ - if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { - fprintf (stderr, "Malloc fails for xsup_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers+1) * iword; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF L and U. - ------------------------------------------------------------*/ - gb = EMPTY; - for (i = 0; i < n; i++) { - if (gb != supno_n[i]) { - /* a new supernode starts */ - gb = supno_n[i]; - xsup_n[gb] = i; - } - } - xsup_n[nsupers] = n; - - for (p = 0; p < nprocs; p++) { - send_1[p] = FALSE; - send_2[p] = FALSE; - } - for (gb_n = 0; gb_n < nsupers; gb_n ++) { - i = xsup_n[gb_n]; - if (iam == (int) OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { - k = lsub_s[j]; - if (k >= i) { - gb = supno_n[k]; - p = (int) PNUM( PROW(gb, grid), pc, grid ); - nnzToSend[2*p] ++; - send_1[p] = TRUE; - } - } - for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { - k = usub_s[j]; - if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { - gb = supno_n[k]; - p = PNUM( pr, PCOL(gb, grid), grid); - nnzToSend[2*p+1] ++; - send_2[p] = TRUE; - } - } - - nsend_2 = 0; - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - nnzToSend[2*p+1] += 2; - if (send_2[p]) nsend_2 ++; - } - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) - if (send_2[p] || p == p_diag) { - if (p == p_diag && !send_2[p]) - nnzToSend[2*p+1] += nsend_2; - else - nnzToSend[2*p+1] += nsend_2-1; - send_2[p] = FALSE; - } - nsend_1 = 0; - for (p = pc; p < nprocs; p += grid->npcol) { - nnzToSend[2*p] += 2; - if (send_1[p]) nsend_1 ++; - } - for (p = pc; p < nprocs; p += grid->npcol) - if (send_1[p]) { - nnzToSend[2*p] += nsend_1-1; - send_1[p] = FALSE; - } - else - nnzToSend[2*p] += nsend_1; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, - grid->comm); - - nnz_loc_l = nnz_loc_u = 0; - SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; - for (p = 0; p < nprocs; p++) { - if ( p != iam ) { - SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; - SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; - RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; - RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } else { - nnz_loc_l += nnzToRecv[2*p]; - nnz_loc_u += nnzToRecv[2*p+1]; - nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; - nnzToRecv_l[p] = nnzToRecv[2*p]; - nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } - } - - /* Allocate space for storing the symbolic structure after redistribution. */ - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for xlsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_j+1) * iword; - - if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for xusub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_i+1) * iword; - - /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ - if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { - if (!(rcv_luind = - intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { - fprintf (stderr, "Malloc fails for rcv_luind[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) - * iword; - } - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { - fprintf (stderr, "Malloc fails for index[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------------ - LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF L and U. - ------------------------------------------------------------------*/ - sendL = TRUE; - sendU = FALSE; - while (sendL || sendU) { - if (sendL) { - xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; - nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; - } - if (sendU) { - xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; - nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; - } - for (i = 0, j = 0, p = 0; p < nprocs; p++) { - if ( p != iam ) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - ptrToRecv[p] = j; j += nnzToRecv[p]; - } - nnzToRecv[iam] = 0; - - ind_loc = ptrToRecv[iam]; - for (gb_n = 0; gb_n < nsupers; gb_n++) { - nsend_2 = 0; - i = xsup_n[gb_n]; - if (iam == OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid ); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - - if (sendL) { - p = pc; np = grid->nprow; - } else { - p = pr * grid->npcol; np = grid->npcol; - } - for (j = 0; j < np; j++) { - if (p == iam) { - rcv_luind[ind_loc] = gb_n; - rcv_luind[ind_loc+1] = 0; - tmp_ptrToSend[p] = ind_loc + 1; - ind_loc += 2; - } - else { - snd_luind[ptrToSend[p]] = gb_n; - snd_luind[ptrToSend[p]+1] = 0; - tmp_ptrToSend[p] = ptrToSend[p] + 1; - ptrToSend[p] += 2; - } - if (sendL) p += grid->npcol; - if (sendU) p++; - } - for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { - k = sub_s[j]; - if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { - gb = supno_n[k]; - if (sendL) - p = PNUM( PROW(gb, grid), pc, grid ); - else - p = PNUM( pr, PCOL(gb, grid), grid); - if (send_1[p] == FALSE) { - send_1[p] = TRUE; - send_2[nsend_2] = k; nsend_2 ++; - } - if (p == iam) { - rcv_luind[ind_loc] = k; ind_loc++; - if (sendL) - xsub_n[LBj( gb_n, grid )] ++; - else - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = k; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - if (sendL) - for (p = pc; p < nprocs; p += grid->npcol) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if (PNUM(PROW(gb, grid), pc, grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBj( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - if (sendU) - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - if (send_1[p] || p == p_diag) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if(PNUM( pr, PCOL(gb, grid), grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - } - } - } - - /* reset ptrToSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv) */ - for (i = 0, p = 0; p < nprocs; p++) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs > 1) { -#if defined (_LONGINT) - nnzToSend[iam] = 0; - for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptrToSend[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptrToSend; - intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; -#endif - - MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, - rcv_luind, intBuf3, intBuf4, mpi_int_t, - grid->comm); - } - if (sendL) - nnzToRecv[iam] = nnz_loc_l; - else - nnzToRecv[iam] = nnz_loc_u; - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE. - -------------------------------------------------------------*/ - if (sendU) - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - SUPERLU_FREE (snd_luind); - memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------ - CONVERT THE FORMAT. - ------------------------------------------------------------*/ - /* Initialize the array of column of L/ row of U pointers */ - k = 0; - for (p = 0; p < nprocs; p ++) { - if (p != iam) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - nelts = rcv_luind[i+1]; - if (sendL) - xsub_n[LBj( gb, grid )] = nelts; - else - xsub_n[LBi( gb, grid )] = nelts; - i += nelts + 2; - } - } - k += nnzToRecv[p]; - } - - if (sendL) j = nsupers_j; - else j = nsupers_i; - k = 0; - isize = xsub_n[0]; - xsub_n[0] = 0; - for (gb_l = 1; gb_l < j; gb_l++) { - k += isize; - isize = xsub_n[gb_l]; - xsub_n[gb_l] = k; - } - xsub_n[gb_l] = k + isize; - nnz_loc = xsub_n[gb_l]; - if (sendL) { - lsub_n = NULL; - if (nnz_loc) { - if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for lsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = lsub_n; - } - if (sendU) { - usub_n = NULL; - if (nnz_loc) { - if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for usub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = usub_n; - } - - /* Copy the data into the L column / U row oriented storage */ - k = 0; - for (p = 0; p < nprocs; p++) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - if (gb >= nsupers) - printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", - iam, p, gb, nsupers, i, i-k); - i += 2; - if (sendL) gb_l = LBj( gb, grid ); - if (sendU) gb_l = LBi( gb, grid ); - for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { - sub_n[j] = rcv_luind[i]; - } - } - k += nnzToRecv[p]; - } - if (sendL) { - sendL = FALSE; sendU = TRUE; - } - else - sendU = FALSE; - } - - /* deallocate memory allocated during symbolic factorization routine */ - if (rcv_luind != NULL) { - SUPERLU_FREE (rcv_luind); - memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; - } - SUPERLU_FREE (mem); - memAux -= (float) (12 * nprocs * iword); - SUPERLU_FREE(nvtcs); - memAux -= (float) (5 * nprocs * sizeof(int)); - - if (xlsub_s != NULL) { - SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); - } - if (xusub_s != NULL) { - SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); - } - SUPERLU_FREE (globToLoc); - if (supno_s != NULL) { - SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); - SUPERLU_FREE (supno_s); - } - - Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; - *p_xlsub = xlsub_n; *p_lsub = lsub_n; - *p_xusub = xusub_n; *p_usub = usub_n; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dist_symbLU()"); -#endif - - return (-memRet); -} - -static float -zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val, - int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val, - int_t *ilsum_i, int_t *ilsum_j - ) -{ -/* - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. The lower part is - * stored using a column format and the upper part - * is stored using a row format. - * - * Arguments - * ========= - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_persist (Input) Glu_persist_t * - * Information on supernodes mapping. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * p_ainf_colptr (Output) int_t** - * Pointer to the lower part of A distributed on a 2D grid - * of processors, stored by columns. - * - * p_ainf_rowind (Output) int_t** - * Structure of of the lower part of A distributed on a - * 2D grid of processors, stored by columns. - * - * p_ainf_val (Output) doublecomplex** - * Numerical values of the lower part of A, distributed on a - * 2D grid of processors, stored by columns. - * - * p_asup_rowptr (Output) int_t** - * Pointer to the upper part of A distributed on a 2D grid - * of processors, stored by rows. - * - * p_asup_colind (Output) int_t** - * Structure of of the upper part of A distributed on a - * 2D grid of processors, stored by rows. - * - * p_asup_val (Output) doublecomplex** - * Numerical values of the upper part of A, distributed on a - * 2D grid of processors, stored by rows. - * - * ilsum_i (Input) int_t * - * Starting position of each supernode in - * the full array (local, block row wise). - * - * ilsum_j (Input) int_t * - * Starting position of each supernode in - * the full array (local, block column wise). - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated when out of memory. - * (an approximation). - * - */ - int iam, p, procs; - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; - int_t nsupers, nsupers_i, nsupers_j; - int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - doublecomplex *asup_val, *ainf_val; - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - doublecomplex *aij, **aij_send, *nzval, *dtemp; - doublecomplex *nzval_a; - MPI_Request *send_req; - MPI_Status status; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword, szbuf; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter zdist_A()"); -#endif - iword = sizeof(int_t); - dword = sizeof(double); - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - if (!(nnzToRecv = intCalloc_dist(2*procs))) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (ERROR_RET); - } - memAux = (float) (2 * procs * iword); - memRet = 0.; - nnzToSend = nnzToRecv + procs; - nsupers = supno[n-1] + 1; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - szbuf = k; - - /* Allocate space for storing the triplets after redistribution. */ - if ( !(ia = intMalloc_dist(2*k)) ) { - fprintf (stderr, "Malloc fails for ia[]."); - return (memAux); - } - memAux += (float) (2*k*iword); - ja = ia + k; - if ( !(aij = doublecomplexMalloc_dist(k)) ) { - fprintf (stderr, "Malloc fails for aij[]."); - return (memAux); - } - memAux += (float) (k*dword); - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { - fprintf (stderr, "Malloc fails for send_req[]."); - return (memAux); - } - memAux += (float) (2*procs *sizeof(MPI_Request)); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ia_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(int_t*)); - if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for aij_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(doublecomplex*)); - if ( !(index = intMalloc_dist(2*SendCnt)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memAux); - } - memAux += (float) (2*SendCnt*iword); - if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) { - fprintf(stderr, "Malloc fails for nzval[]."); - return (memAux); - } - memAux += (float) (SendCnt * dword); - if ( !(ptr_to_send = intCalloc_dist(procs)) ) { - fprintf(stderr, "Malloc fails for ptr_to_send[]."); - return (memAux); - } - memAux += (float) (procs * iword); - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for itemp[]."); - return (memAux); - } - memAux += (float) (2*maxnnzToRecv*iword); - if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for dtemp[]."); - return (memAux); - } - memAux += (float) (maxnnzToRecv * dword); - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { - fprintf (stderr, "Malloc fails for *ainf_colptr[]."); - return (memAux); - } - memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; - if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { - fprintf (stderr, "Malloc fails for *asup_rowptr[]."); - return (memAux+memRet); - } - memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nnz_loc_ainf = nnz_loc_asup = 0; - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - irow = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /* assert(jcol= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - MPI_Wait( &send_req[p], &status); - MPI_Wait( &send_req[procs+p], &status); - } - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - - SUPERLU_FREE(nnzToRecv); - memAux -= 2 * procs * iword; - if ( procs > 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - SUPERLU_FREE(ptr_to_send); - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + - procs*sizeof(doublecomplex*) + 2*SendCnt * iword + - SendCnt* dword + procs*iword + - 2*maxnnzToRecv*iword + maxnnzToRecv*dword; - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT. - ------------------------------------------------------------*/ - if (nnz_loc_ainf != 0) { - if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_rowind[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * iword); - if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_val[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * dword); - } - else { - ainf_rowind = NULL; - ainf_val = NULL; - } - if (nnz_loc_asup != 0) { - if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_colind[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * iword); - if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_val[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * dword); - } - else { - asup_colind = NULL; - asup_val = NULL; - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = ainf_colptr[0]; ainf_colptr[0] = 0; - for (j = 1; j < ilsum_j[nsupers_j]; j++) { - k += jsize; - jsize = ainf_colptr[j]; - ainf_colptr[j] = k; - } - ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; - i = 0; - isize = asup_rowptr[0]; asup_rowptr[0] = 0; - for (j = 1; j < ilsum_i[nsupers_i]; j++) { - i += isize; - isize = asup_rowptr[j]; - asup_rowptr[j] = i; - } - asup_rowptr[ilsum_i[nsupers_i]] = i + isize; - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - jcol = ja[i]; - irow = ia[i]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); - k = ainf_colptr[j]; - ainf_rowind[k] = irow; - ainf_val[k] = aij[i]; - ainf_colptr[j] ++; - } - else { - j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); - k = asup_rowptr[j]; - asup_colind[k] = jcol; - asup_val[k] = aij[i]; - asup_rowptr[j] ++; - } - } - - /* Reset the column pointers to the beginning of each column */ - for (j = ilsum_j[nsupers_j]; j > 0; j--) - ainf_colptr[j] = ainf_colptr[j-1]; - for (j = ilsum_i[nsupers_i]; j > 0; j--) - asup_rowptr[j] = asup_rowptr[j-1]; - ainf_colptr[0] = 0; - asup_rowptr[0] = 0; - - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - memAux -= 2*szbuf*iword + szbuf*dword; - - *p_ainf_colptr = ainf_colptr; - *p_ainf_rowind = ainf_rowind; - *p_ainf_val = ainf_val; - *p_asup_rowptr = asup_rowptr; - *p_asup_colind = asup_colind; - *p_asup_val = asup_val; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit zdist_A()"); - fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); -#endif - - return (-memRet); -} /* dist_A */ - -int_t -zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Pslu_freeable_t *Pslu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * - * - * Purpose - * ======= - * Distribute the input matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * This routine should not be called for this case, an error - * is generated. Instead, pddistribute routine should be called. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (Input) int - * Dimension of the matrix. - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (Input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (Input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated for performing the distribution - * of the data, when out of memory. - * (an approximation). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t Glu_freeable_n; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, - len, len1, nsupc, nsupc_gb, ii, nprocs; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - doublecomplex *a; - int_t *asub, *xa; - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - doublecomplex *asup_val, *ainf_val; - int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; - int_t next_ind; /* next available position in index[*] */ - int_t next_val; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ - int_t *recvBuf; - int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in - the full array (local, block column wise) */ - /*-- Auxiliary arrays; freed on return --*/ - int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *LUb_length; /* L,U block length; size nsupers_ij */ - int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ - int_t *LUb_number; /* global block number; size nsupers_ij */ - int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ - int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ - doublecomplex zero = {0.0, 0.0}; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float memStrLU, memA, - memDist = 0.; /* memory used for redistributing the data, which does - not include the memory for the numerical values of L and U */ - float memNLU = 0.; /* memory allocated for storing the numerical values of - L and U, that will be used in the numeric factorization */ - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif - - /* Initialization. */ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); -#endif - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nprocs = grid->npcol * grid->nprow; - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - Astore = (NRformat_loc *) A->Store; - - iword = sizeof(int_t); - dword = sizeof(doublecomplex); - - if (fact == SamePattern_SameRowPerm) { - ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); - } - - if ((memStrLU = - dist_symbLU (n, Pslu_freeable, - Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) - return (memStrLU); - memDist += (-memStrLU); - xsup = Glu_persist->xsup; /* supernode and column mapping */ - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ - nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ - nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); - if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for ilsum[]."); - return (memDist + memNLU); - } - memNLU += (nsupers_i+1) * iword; - if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for ilsum_j[]."); - return (memDist + memNLU); - } - memDist += (nsupers_j+1) * iword; - - /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */ - ilsum[0] = 0; - ldaspa = 0; - for (gb = 0; gb < nsupers; gb++) - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - ilsum[nsupers_i] = ldaspa; - - ldaspa_j = 0; ilsum_j[0] = 0; - for (gb = 0; gb < nsupers; gb++) - if (mycol == PCOL( gb, grid )) { - i = SuperSize( gb ); - ldaspa_j += i; - lb = LBj( gb, grid ); - ilsum_j[lb + 1] = ilsum_j[lb] + i; - } - ilsum_j[nsupers_j] = ldaspa_j; - - if ((memA = zdist_A(A, ScalePermstruct, Glu_persist, - grid, &ainf_colptr, &ainf_rowind, &ainf_val, - &asup_rowptr, &asup_colind, &asup_val, - ilsum, ilsum_j)) > 0) - return (memDist + memA + memNLU); - memDist += (-memA); - - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - if ( !(ToRecv = intCalloc_dist(nsupers)) ) { - fprintf(stderr, "Calloc fails for ToRecv[]."); - return (memDist + memNLU); - } - memNLU += nsupers * iword; - - k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ToSendR[]."); - return (memDist + memNLU); - } - memNLU += k*sizeof(int_t*); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memDist + memNLU); - } - memNLU += j*iword; - - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - - /* Auxiliary arrays used to set up L and U block data structures. - They are freed on return. */ - if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_length[]."); - return (memDist + memNLU); - } - if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Malloc fails for LUb_indptr[]."); - return (memDist + memNLU); - } - if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_number[]."); - return (memDist + memNLU); - } - if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_valptr[]."); - return (memDist + memNLU); - } - memDist += 4 * nsupers_ij * iword; - - k = CEILING( nsupers, grid->nprow ); - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); - return (memDist + memNLU); - } - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*); - Unzval_br_ptr[nsupers_i-1] = NULL; - Ufstnz_br_ptr[nsupers_i-1] = NULL; - - if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Malloc fails for ToSendD[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*iword; - if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - memDist += (nsupers_i + nsupers_j)*iword; - - /* Auxiliary arrays used to set up L, U block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) - * sp_ienv_dist(3))) ) { - fprintf(stderr, "Calloc fails for SPA dense[]."); - return (memDist + memNLU); - } - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for fmod[]."); - return (memDist + memNLU); - } - if ( !(bmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for bmod[]."); - return (memDist + memNLU); - } - /* ------------------------------------------------ */ - memNLU += 2*nsupers_i*iword + - SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*); - Lnzval_bc_ptr[nsupers_j-1] = NULL; - Lrowind_bc_ptr[nsupers_j-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[]."); - return (memDist + memNLU); - } - len = nsupers_j * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[]."); - return (memDist + memNLU); - } - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ - memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid); /* Local block number row wise */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - if ( myrow == jbrow ) { /* Block row jb in my process row */ - /* Scatter A into SPA. */ - for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { - for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { - if (i >= asup_rowptr[ilsum[nsupers_i]]) - printf ("ERR7\n"); - jcol = asup_colind[i]; - if (jcol >= n) - printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - gb = BlockNum( jcol ); - lb = LBj( gb, grid ); - if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); - jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); - if (jcol >= ldaspa_j) - printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - dense_col[jcol] = asup_val[i]; - } - dense_col += ldaspa_j; - } - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - /* Count number of blocks and length of each block. */ - nrbu = 0; - len = 0; /* Number of column subscripts I own. */ - len1 = 0; /* number of fstnz subscripts */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - if (i >= xusub[nsupers_i]) printf ("ERR10\n"); - jcol = usub[i]; - gb = BlockNum( jcol ); /* Global block number */ - - /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) - printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", - iam, jb, gb, jcol, jbcol, pc); */ - - lb = LBj( gb, grid ); /* Local block number */ - pc = PCOL( gb, grid ); /* Process col owning this block */ - if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; - /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ - pr = PROW( gb, grid ); - if ( pr != jbrow && mycol == pc) - bsendx_plist[lb][jbrow] = YES; - if (mycol == pc) { - len += nsupc; - LUb_length[lb] += nsupc; - ToSendD[ljb_i] = YES; - if (Urb_marker[lb] <= jb) { /* First see this block */ - if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; - Urb_marker[lb] = jb + 1; - LUb_number[nrbu] = gb; - /* if (gb == 391825 && jb == 145361) - printf ("Pe[%d] T1 [%d %d] nrbu %d \n", - iam, jb, gb, nrbu); */ - nrbu ++; - len1 += SuperSize( gb ); - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[ljb_i];/* Mod. count for back solve */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - } - } /* for i ... */ - - if ( nrbu ) { - /* Sort the blocks of U in increasing block column index. - SuperLU_DIST assumes this is true */ - /* simple insert sort algorithm */ - /* to be transformed in quick sort */ - for (j = 1; j < nrbu; j++) { - k = LUb_number[j]; - for (i=j-1; i>=0 && LUb_number[i] > k; i--) { - LUb_number[i+1] = LUb_number[i]; - } - LUb_number[i+1] = k; - } - - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) { - fprintf (stderr, "Malloc fails for Uindex[]"); - return (memDist + memNLU); - } - Ufstnz_br_ptr[ljb_i] = index; - if (!(Unzval_br_ptr[ljb_i] = - doublecomplexMalloc_dist(len))) { - fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); - return (memDist + memNLU); - } - memNLU += (len1+1)*iword + len*dword; - uval = Unzval_br_ptr[ljb_i]; - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = nrbu; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index */ - index[len1] = -1; /* End marker */ - next_ind = BR_HEADER; - next_val = 0; - for (k = 0; k < nrbu; k++) { - gb = LUb_number[k]; - lb = LBj( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; /* Reset vector of block length */ - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) - index[next_ind] = FstBlockC( jb + 1 ); - LUb_valptr[lb] = next_val; - next_val += len; - } - /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], - and the initial values of A from SPA into Unzval_br_ptr[]. */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - jcol = usub[i]; - gb = BlockNum( jcol ); - - if ( mycol == PCOL( gb, grid ) ) { - lb = LBj( gb, grid ); - k = LUb_indptr[lb]; /* Start fstnz in index */ - index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); - } - } /* for i ... */ - - for (i = 0; i < nrbu; i++) { - gb = LUb_number[i]; - lb = LBj( gb, grid ); - next_ind = LUb_indptr[lb]; - k = FstBlockC( jb + 1); - jcol = ilsum_j[lb]; - for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { - dense_col = dense; - j = index[next_ind+jj]; - for (ii = j; ii < k; ii++) { - uval[LUb_valptr[lb]++] = dense_col[jcol]; - dense_col[jcol] = zero; - dense_col += ldaspa_j; - } - } - } - } else { - Ufstnz_br_ptr[ljb_i] = NULL; - Unzval_br_ptr[ljb_i] = NULL; - } /* if nrbu ... */ - } /* if myrow == jbrow */ - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - if (mycol == jbcol) { /* Block column jb in my process column */ - /* Scatter A_inf into SPA. */ - for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { - for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { - irow = ainf_rowind[i]; - if (irow >= n) printf ("Pe[%d] ERR1\n", iam); - gb = BlockNum( irow ); - if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); - dense_col[irow] = ainf_val[i]; - } - } - dense_col += ldaspa; - } - - /* sort the indices of the diagonal block at the beginning of xlsub */ - if (myrow == jbrow) { - k = xlsub[ljb_j]; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - if (irow < nsupc + fsupc && i != k+irow-fsupc) { - lsub[i] = lsub[k + irow - fsupc]; - lsub[k + irow - fsupc] = irow; - i --; - } - } - } - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && - myrow == jbrow) { - fsendx_plist[ljb_j][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (Lrb_marker[lb] <= jb) { /* First see this block */ - Lrb_marker[lb] = jb + 1; - LUb_length[lb] = 1; - LUb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else - ++LUb_length[lb]; - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* If I am the owner of the diagonal block, order it first in LUb_number. - Necessary for SuperLU_DIST routines */ - kseen = EMPTY; - for (j = 0; j < nrbl; j++) { - if (LUb_number[j] == jb) - kseen = j; - } - if (kseen != EMPTY && kseen != 0) { - LUb_number[kseen] = LUb_number[0]; - LUb_number[0] = jb; - } - - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) { - fprintf (stderr, "Malloc fails for index[]"); - return (memDist + memNLU); - } - Lrowind_bc_ptr[ljb_j] = index; - if (!(Lnzval_bc_ptr[ljb_j] = - doublecomplexMalloc_dist(len*nsupc))) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb); - return (memDist + memNLU); - } - memNLU += len1*iword + len*nsupc*dword; - - lusup = Lnzval_bc_ptr[ljb_j]; - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_ind = BC_HEADER; - next_val = 0; - for (k = 0; k < nrbl; ++k) { - gb = LUb_number[k]; - lb = LBi( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - LUb_valptr[lb] = next_val; - next_ind += len; - next_val += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = LUb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = LUb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb_j] = NULL; - Lnzval_bc_ptr[ljb_j] = NULL; - } /* if nrbl ... */ - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(ilsum_j); - SUPERLU_FREE(Urb_marker); - SUPERLU_FREE(LUb_length); - SUPERLU_FREE(LUb_indptr); - SUPERLU_FREE(LUb_number); - SUPERLU_FREE(LUb_valptr); - SUPERLU_FREE(Lrb_marker); - SUPERLU_FREE(dense); - - /* Free the memory used for storing L and U */ - SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); - if (lsub != NULL) - SUPERLU_FREE(lsub); - if (usub != NULL) - SUPERLU_FREE(usub); - - /* Free the memory used for storing A */ - SUPERLU_FREE(ainf_colptr); - if (ainf_rowind != NULL) { - SUPERLU_FREE(ainf_rowind); - SUPERLU_FREE(ainf_val); - } - SUPERLU_FREE(asup_rowptr); - if (asup_colind != NULL) { - SUPERLU_FREE(asup_colind); - SUPERLU_FREE(asup_val); - } - - /* exchange information about bsendx_plist in between column of processors */ - k = SUPERLU_MAX( grid->nprow, grid->npcol); - if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { - fprintf (stderr, "Malloc fails for recvBuf[]."); - return (memDist + memNLU); - } - if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - - if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) - memDist = nsupers*k*iword +4*nprocs * sizeof(int); - - for (p = 0; p < nprocs; p++) - nnzToRecv[p] = 0; - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - nnzToRecv[p] += grid->npcol; - } - i = 0; - for (p = 0; p < nprocs; p++) { - ptrToRecv[p] = i; - i += nnzToRecv[p]; - ptrToSend[p] = 0; - if (p != iam) - nnzToSend[p] = nnzToRecv[iam]; - else - nnzToSend[p] = 0; - } - nnzToRecv[iam] = 0; - i = ptrToRecv[iam]; - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - if (p == iam) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - for (j = 0; j < grid->npcol; j++, i++) - recvBuf[i] = ToSendR[ljb_j][j]; - } - } - - MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, - recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid ); /* Local block number row wise */ - /* (myrow == jbrow) { - if (ToSendD[ljb_i] == YES) - ToRecv[jb] = 1; - } - else { - if (recvBuf[ptrToRecv[p] + mycol] == YES) - ToRecv[jb] = 2; - } */ - if (recvBuf[ptrToRecv[p] + mycol] == YES) { - if (myrow == jbrow) - ToRecv[jb] = 1; - else - ToRecv[jb] = 2; - } - if (mycol == jbcol) { - for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) - ToSendR[ljb_j][i] = recvBuf[j]; - ToSendR[ljb_j][mycol] = EMPTY; - } - ptrToRecv[p] += grid->npcol; - } - - /* exchange information about bsendx_plist in between column of processors */ - MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, - MPI_MAX, grid->cscp.comm); - - for (jb = 0; jb < nsupers; jb ++) { - jbcol = PCOL( jb, grid); - jbrow = PROW( jb, grid); - if (mycol == jbcol) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - if (myrow == jbrow ) { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { - (*bsendx_plist)[k] = recvBuf[k]; - if ((*bsendx_plist)[k] != EMPTY) - nbsendx ++; - } - } - else { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) - (*bsendx_plist)[k] = EMPTY; - } - } - } - - SUPERLU_FREE(nnzToRecv); - SUPERLU_FREE(ptrToRecv); - SUPERLU_FREE(nnzToSend); - SUPERLU_FREE(ptrToSend); - SUPERLU_FREE(recvBuf); - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist, - ToRecv, ToSendR, ToSendD - */ - CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); -#endif - - return (- (memDist+memNLU)); -} /* dist_psymbtonum */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,1939 +0,0 @@ - -/* - * -- Parallel symbolic factorization auxialiary routine (version 2.1) -- - * -- Distributes the data from parallel symbolic factorization - * -- to numeric factorization - * INRIA France - July 1, 2004 - * Laura Grigori - * - * November 1, 2007 - */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -#include "superlu_zdefs.h" - -static float -dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, - Glu_persist_t *Glu_persist, - int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, - gridinfo_t *grid - ) -/* - * Purpose - * ======= - * - * Redistribute the symbolic structure of L and U from the distribution - * used in the parallel symbolic factorization step to the distdibution - * used in the parallel numeric factorization step. On exit, the L and U - * structure for the 2D distribution used in the numeric factorization step is - * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal - * information is also computed and it is stored in Glu_persist->supno - * and Glu_persist->xsup. - * - * This routine allocates memory for storing the structure of L and U - * and the supernodes information. This represents the arrays: - * p_xlsub, p_lsub, p_xusub, p_usub, - * Glu_persist->supno, Glu_persist->xsup. - * - * This routine also deallocates memory allocated during symbolic - * factorization routine. That is, the folloing arrays are freed: - * Pslu_freeable->xlsub, Pslu_freeable->lsub, - * Pslu_freeable->xusub, Pslu_freeable->usub, - * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, - * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. - * - * Arguments - * ========= - * - * n (Input) int_t - * Order of the input matrix - * Pslu_freeable (Input) Pslu_freeable_t * - * Local L and U structure, - * global to local indexing information. - * - * Glu_persist (Output) Glu_persist_t * - * Stores on output the information on supernodes mapping. - * - * p_xlsub (Output) int_t ** - * Pointer to structure of L distributed on a 2D grid - * of processors, stored by columns. - * - * p_lsub (Output) int_t ** - * Structure of L distributed on a 2D grid of processors, - * stored by columns. - * - * p_xusub (Output) int_t ** - * Pointer to structure of U distributed on a 2D grid - * of processors, stored by rows. - * - * p_usub (Output) int_t ** - * Structure of U distributed on a 2D grid of processors, - * stored by rows. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU. - * > 0, number of bytes allocated in this routine when out of memory. - * (an approximation). - */ -{ - int iam, nprocs, pc, pr, p, np, p_diag; - int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, - *tmp_ptrToSend, *mem; - int_t *nnzToRecv_l, *nnzToRecv_u; - int_t *send_1, *send_2, nsend_1, nsend_2; - int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; - int_t nsupers, nsupers_i, nsupers_j; - int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; - int_t maxszsn, maxNvtcsPProc; - int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; - int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; - int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; - int_t *xsub_s, *sub_s, *xsub_n, *sub_n; - int_t *globToLoc, nvtcs_loc; - int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, - RecvCnt_l, RecvCnt_u, ind_loc; - int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; - int_t nelts, isize; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_symbLU()"); -#endif - nprocs = (int) grid->nprow * grid->npcol; - xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; - xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; - maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; - globToLoc = Pslu_freeable->globToLoc; - nvtcs_loc = Pslu_freeable->nvtcs_loc; - xsup_beg_s = Pslu_freeable->xsup_beg_loc; - xsup_end_s = Pslu_freeable->xsup_end_loc; - supno_s = Pslu_freeable->supno_loc; - rcv_luind = NULL; - iword = sizeof(int_t); - dword = sizeof(doublecomplex); - memAux = 0.; memRet = 0.; - - mem = intCalloc_dist(12 * nprocs); - if (!mem) - return (ERROR_RET); - memAux = (float) (12 * nprocs * sizeof(int_t)); - nnzToRecv = mem; - nnzToSend = nnzToRecv + 2*nprocs; - nnzToSend_l = nnzToSend + 2 * nprocs; - nnzToSend_u = nnzToSend_l + nprocs; - send_1 = nnzToSend_u + nprocs; - send_2 = send_1 + nprocs; - tmp_ptrToSend = send_2 + nprocs; - nnzToRecv_l = tmp_ptrToSend + nprocs; - nnzToRecv_u = nnzToRecv_l + nprocs; - - ptrToSend = nnzToSend; - ptrToRecv = nnzToSend + nprocs; - - nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); - intBuf1 = nvtcs + nprocs; - intBuf2 = nvtcs + 2 * nprocs; - intBuf3 = nvtcs + 3 * nprocs; - intBuf4 = nvtcs + 4 * nprocs; - memAux += 5 * nprocs * sizeof(int); - - maxszsn = sp_ienv_dist(3); - - /* Allocate space for storing Glu_persist_n. */ - if ( !(supno_n = intMalloc_dist(n+1)) ) { - fprintf (stderr, "Malloc fails for supno_n[]."); - return (memAux); - } - memRet += (float) ((n+1) * sizeof(int_t)); - - /* ------------------------------------------------------------ - DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION - ------------------------------------------------------------*/ - - if (nvtcs_loc > INT_MAX) - ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); - intNvtcs_loc = (int) nvtcs_loc; - MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, - 0, grid->comm); - - if (!iam) { - /* set ptrToRecv to point to the beginning of the data for - each processor */ - for (k = 0, p = 0; p < nprocs; p++) { - ptrToRecv[p] = k; - k += nvtcs[p]; - } - } - - if (nprocs > 1) { - temp = NULL; - if (!iam ) { - if ( !(temp = intMalloc_dist (n+1)) ) { - fprintf (stderr, "Malloc fails for temp[]."); - return (memAux + memRet); - } - memAux += (float) (n+1) * iword; - } -#if defined (_LONGINT) - for (p=0; p INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = ptrToRecv; -#endif - MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, - temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); - } - else - temp = supno_s; - - if (!iam) { - nsupers = 0; - p = (int) OWNER( globToLoc[0] ); - gb = temp[ptrToRecv[p]]; - supno_n[0] = nsupers; - ptrToRecv[p] ++; - szsn = 1; - for (j = 1; j < n; j ++) { - if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { - nsupers ++; - p = (int) OWNER( globToLoc[j] ); - gb = temp[ptrToRecv[p]]; - szsn = 1; - } - else { - szsn ++; - } - ptrToRecv[p] ++; - supno_n[j] = nsupers; - } - nsupers++; - if (nprocs > 1) { - SUPERLU_FREE (temp); - memAux -= (float) (n+1) * iword; - } - supno_n[n] = nsupers; - } - - /* reset to 0 nnzToSend */ - for (p = 0; p < 2 *nprocs; p++) - nnzToSend[p] = 0; - - MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); - nsupers = supno_n[n]; - /* Allocate space for storing Glu_persist_n. */ - if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { - fprintf (stderr, "Malloc fails for xsup_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers+1) * iword; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF L and U. - ------------------------------------------------------------*/ - gb = EMPTY; - for (i = 0; i < n; i++) { - if (gb != supno_n[i]) { - /* a new supernode starts */ - gb = supno_n[i]; - xsup_n[gb] = i; - } - } - xsup_n[nsupers] = n; - - for (p = 0; p < nprocs; p++) { - send_1[p] = FALSE; - send_2[p] = FALSE; - } - for (gb_n = 0; gb_n < nsupers; gb_n ++) { - i = xsup_n[gb_n]; - if (iam == (int) OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { - k = lsub_s[j]; - if (k >= i) { - gb = supno_n[k]; - p = (int) PNUM( PROW(gb, grid), pc, grid ); - nnzToSend[2*p] ++; - send_1[p] = TRUE; - } - } - for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { - k = usub_s[j]; - if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { - gb = supno_n[k]; - p = PNUM( pr, PCOL(gb, grid), grid); - nnzToSend[2*p+1] ++; - send_2[p] = TRUE; - } - } - - nsend_2 = 0; - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - nnzToSend[2*p+1] += 2; - if (send_2[p]) nsend_2 ++; - } - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) - if (send_2[p] || p == p_diag) { - if (p == p_diag && !send_2[p]) - nnzToSend[2*p+1] += nsend_2; - else - nnzToSend[2*p+1] += nsend_2-1; - send_2[p] = FALSE; - } - nsend_1 = 0; - for (p = pc; p < nprocs; p += grid->npcol) { - nnzToSend[2*p] += 2; - if (send_1[p]) nsend_1 ++; - } - for (p = pc; p < nprocs; p += grid->npcol) - if (send_1[p]) { - nnzToSend[2*p] += nsend_1-1; - send_1[p] = FALSE; - } - else - nnzToSend[2*p] += nsend_1; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, - grid->comm); - - nnz_loc_l = nnz_loc_u = 0; - SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; - for (p = 0; p < nprocs; p++) { - if ( p != iam ) { - SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; - SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; - RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; - RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } else { - nnz_loc_l += nnzToRecv[2*p]; - nnz_loc_u += nnzToRecv[2*p+1]; - nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; - nnzToRecv_l[p] = nnzToRecv[2*p]; - nnzToRecv_u[p] = nnzToRecv[2*p+1]; - } - } - - /* Allocate space for storing the symbolic structure after redistribution. */ - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for xlsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_j+1) * iword; - - if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for xusub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nsupers_i+1) * iword; - - /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ - if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { - if (!(rcv_luind = - intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { - fprintf (stderr, "Malloc fails for rcv_luind[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) - * iword; - } - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { - fprintf (stderr, "Malloc fails for index[]."); - return (memAux + memRet); - } - memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------------ - LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF L and U. - ------------------------------------------------------------------*/ - sendL = TRUE; - sendU = FALSE; - while (sendL || sendU) { - if (sendL) { - xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; - nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; - } - if (sendU) { - xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; - nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; - } - for (i = 0, j = 0, p = 0; p < nprocs; p++) { - if ( p != iam ) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - ptrToRecv[p] = j; j += nnzToRecv[p]; - } - nnzToRecv[iam] = 0; - - ind_loc = ptrToRecv[iam]; - for (gb_n = 0; gb_n < nsupers; gb_n++) { - nsend_2 = 0; - i = xsup_n[gb_n]; - if (iam == OWNER( globToLoc[i] )) { - pc = PCOL( gb_n, grid ); - pr = PROW( gb_n, grid ); - p_diag = PNUM( pr, pc, grid ); - - i_loc = LOCAL_IND( globToLoc[i] ); - gb_s = supno_s[i_loc]; - fst_s = xsup_beg_s[gb_s]; - lst_s = xsup_end_s[gb_s]; - fst_s_l = LOCAL_IND( globToLoc[fst_s] ); - - if (sendL) { - p = pc; np = grid->nprow; - } else { - p = pr * grid->npcol; np = grid->npcol; - } - for (j = 0; j < np; j++) { - if (p == iam) { - rcv_luind[ind_loc] = gb_n; - rcv_luind[ind_loc+1] = 0; - tmp_ptrToSend[p] = ind_loc + 1; - ind_loc += 2; - } - else { - snd_luind[ptrToSend[p]] = gb_n; - snd_luind[ptrToSend[p]+1] = 0; - tmp_ptrToSend[p] = ptrToSend[p] + 1; - ptrToSend[p] += 2; - } - if (sendL) p += grid->npcol; - if (sendU) p++; - } - for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { - k = sub_s[j]; - if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { - gb = supno_n[k]; - if (sendL) - p = PNUM( PROW(gb, grid), pc, grid ); - else - p = PNUM( pr, PCOL(gb, grid), grid); - if (send_1[p] == FALSE) { - send_1[p] = TRUE; - send_2[nsend_2] = k; nsend_2 ++; - } - if (p == iam) { - rcv_luind[ind_loc] = k; ind_loc++; - if (sendL) - xsub_n[LBj( gb_n, grid )] ++; - else - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = k; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - if (sendL) - for (p = pc; p < nprocs; p += grid->npcol) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if (PNUM(PROW(gb, grid), pc, grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBj( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - if (sendU) - for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { - if (send_1[p] || p == p_diag) { - for (k = 0; k < nsend_2; k++) { - gb = supno_n[send_2[k]]; - if(PNUM( pr, PCOL(gb, grid), grid) != p) { - if (p == iam) { - rcv_luind[ind_loc] = send_2[k]; ind_loc++; - xsub_n[LBi( gb_n, grid )] ++; - } - else { - snd_luind[ptrToSend[p]] = send_2[k]; - ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; - } - } - } - send_1[p] = FALSE; - } - } - } - } - - /* reset ptrToSnd to point to the beginning of the data for - each processor (structure needed in MPI_Alltoallv) */ - for (i = 0, p = 0; p < nprocs; p++) { - ptrToSend[p] = i; i += nnzToSend[p]; - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - Note: it uses MPI_Alltoallv. - ------------------------------------------------------------*/ - if (nprocs > 1) { -#if defined (_LONGINT) - for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || - nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) - ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); - intBuf1[p] = (int) nnzToSend[p]; - intBuf2[p] = (int) ptrToSend[p]; - intBuf3[p] = (int) nnzToRecv[p]; - intBuf4[p] = (int) ptrToRecv[p]; - } -#else /* Default */ - intBuf1 = nnzToSend; intBuf2 = ptrToSend; - intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; -#endif - - MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, - rcv_luind, intBuf3, intBuf4, mpi_int_t, - grid->comm); - } - if (sendL) - nnzToRecv[iam] = nnz_loc_l; - else - nnzToRecv[iam] = nnz_loc_u; - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE. - -------------------------------------------------------------*/ - if (sendU) - if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { - SUPERLU_FREE (snd_luind); - memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; - } - - /* ------------------------------------------------------------ - CONVERT THE FORMAT. - ------------------------------------------------------------*/ - /* Initialize the array of column of L/ row of U pointers */ - k = 0; - for (p = 0; p < nprocs; p ++) { - if (p != iam) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - nelts = rcv_luind[i+1]; - if (sendL) - xsub_n[LBj( gb, grid )] = nelts; - else - xsub_n[LBi( gb, grid )] = nelts; - i += nelts + 2; - } - } - k += nnzToRecv[p]; - } - - if (sendL) j = nsupers_j; - else j = nsupers_i; - k = 0; - isize = xsub_n[0]; - xsub_n[0] = 0; - for (gb_l = 1; gb_l < j; gb_l++) { - k += isize; - isize = xsub_n[gb_l]; - xsub_n[gb_l] = k; - } - xsub_n[gb_l] = k + isize; - nnz_loc = xsub_n[gb_l]; - if (sendL) { - lsub_n = NULL; - if (nnz_loc) { - if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for lsub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = lsub_n; - } - if (sendU) { - usub_n = NULL; - if (nnz_loc) { - if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { - fprintf (stderr, "Malloc fails for usub_n[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc * iword); - } - sub_n = usub_n; - } - - /* Copy the data into the L column / U row oriented storage */ - k = 0; - for (p = 0; p < nprocs; p++) { - i = k; - while (i < k + nnzToRecv[p]) { - gb = rcv_luind[i]; - if (gb >= nsupers) - printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", - iam, p, gb, nsupers, i, i-k); - i += 2; - if (sendL) gb_l = LBj( gb, grid ); - if (sendU) gb_l = LBi( gb, grid ); - for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { - sub_n[j] = rcv_luind[i]; - } - } - k += nnzToRecv[p]; - } - if (sendL) { - sendL = FALSE; sendU = TRUE; - } - else - sendU = FALSE; - } - - /* deallocate memory allocated during symbolic factorization routine */ - if (rcv_luind != NULL) { - SUPERLU_FREE (rcv_luind); - memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; - } - SUPERLU_FREE (mem); - memAux -= (float) (12 * nprocs * iword); - SUPERLU_FREE(nvtcs); - memAux -= (float) (5 * nprocs * sizeof(int)); - - if (xlsub_s != NULL) { - SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); - } - if (xusub_s != NULL) { - SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); - } - SUPERLU_FREE (globToLoc); - if (supno_s != NULL) { - SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); - SUPERLU_FREE (supno_s); - } - - Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; - *p_xlsub = xlsub_n; *p_lsub = lsub_n; - *p_xusub = xusub_n; *p_usub = usub_n; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dist_symbLU()"); -#endif - - return (-memRet); -} - -static float -zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val, - int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val, - int_t *ilsum_i, int_t *ilsum_j - ) -{ -/* - * - * Purpose - * ======= - * Re-distribute A on the 2D process mesh. The lower part is - * stored using a column format and the upper part - * is stored using a row format. - * - * Arguments - * ========= - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_persist (Input) Glu_persist_t * - * Information on supernodes mapping. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * p_ainf_colptr (Output) int_t** - * Pointer to the lower part of A distributed on a 2D grid - * of processors, stored by columns. - * - * p_ainf_rowind (Output) int_t** - * Structure of of the lower part of A distributed on a - * 2D grid of processors, stored by columns. - * - * p_ainf_val (Output) doublecomplex** - * Numerical values of the lower part of A, distributed on a - * 2D grid of processors, stored by columns. - * - * p_asup_rowptr (Output) int_t** - * Pointer to the upper part of A distributed on a 2D grid - * of processors, stored by rows. - * - * p_asup_colind (Output) int_t** - * Structure of of the upper part of A distributed on a - * 2D grid of processors, stored by rows. - * - * p_asup_val (Output) doublecomplex** - * Numerical values of the upper part of A, distributed on a - * 2D grid of processors, stored by rows. - * - * ilsum_i (Input) int_t * - * Starting position of each supernode in - * the full array (local, block row wise). - * - * ilsum_j (Input) int_t * - * Starting position of each supernode in - * the full array (local, block column wise). - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated when out of memory. - * (an approximation). - * - */ - int iam, p, procs; - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; - int_t nsupers, nsupers_i, nsupers_j; - int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ - int_t nnz_remote; /* number of remote nonzeros to be sent */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - doublecomplex *asup_val, *ainf_val; - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - doublecomplex *aij, **aij_send, *nzval, *dtemp; - doublecomplex *nzval_a; - MPI_Request *send_req; - MPI_Status status; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - float memAux; /* Memory used during this routine and freed on return */ - float memRet; /* Memory allocated and not freed on return */ - int_t iword, dword, szbuf; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter zdist_A()"); -#endif - iword = sizeof(int_t); - dword = sizeof(double); - - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - if (!(nnzToRecv = intCalloc_dist(2*procs))) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (ERROR_RET); - } - memAux = (float) (2 * procs * iword); - memRet = 0.; - nnzToSend = nnzToRecv + procs; - nsupers = supno[n-1] + 1; - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } - } - - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); - - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - szbuf = k; - - /* Allocate space for storing the triplets after redistribution. */ - if ( !(ia = intMalloc_dist(2*k)) ) { - fprintf (stderr, "Malloc fails for ia[]."); - return (memAux); - } - memAux += (float) (2*k*iword); - ja = ia + k; - if ( !(aij = doublecomplexMalloc_dist(k)) ) { - fprintf (stderr, "Malloc fails for aij[]."); - return (memAux); - } - memAux += (float) (k*dword); - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { - fprintf (stderr, "Malloc fails for send_req[]."); - return (memAux); - } - memAux += (float) (2*procs *sizeof(MPI_Request)); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ia_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(int_t*)); - if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for aij_send[]."); - return (memAux); - } - memAux += (float) (procs*sizeof(doublecomplex*)); - if ( !(index = intMalloc_dist(2*SendCnt)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memAux); - } - memAux += (float) (2*SendCnt*iword); - if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) { - fprintf(stderr, "Malloc fails for nzval[]."); - return (memAux); - } - memAux += (float) (SendCnt * dword); - if ( !(ptr_to_send = intCalloc_dist(procs)) ) { - fprintf(stderr, "Malloc fails for ptr_to_send[]."); - return (memAux); - } - memAux += (float) (procs * iword); - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for itemp[]."); - return (memAux); - } - memAux += (float) (2*maxnnzToRecv*iword); - if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) { - fprintf(stderr, "Malloc fails for dtemp[]."); - return (memAux); - } - memAux += (float) (maxnnzToRecv * dword); - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { - fprintf (stderr, "Malloc fails for *ainf_colptr[]."); - return (memAux); - } - memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; - if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { - fprintf (stderr, "Malloc fails for *asup_rowptr[]."); - return (memAux+memRet); - } - memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nnz_loc_ainf = nnz_loc_asup = 0; - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - irow = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /* assert(jcol= gbj) { - ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; - nnz_loc_ainf ++; - } - else { - asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; - nnz_loc_asup ++; - } - } - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - MPI_Wait( &send_req[p], &status); - MPI_Wait( &send_req[procs+p], &status); - } - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - - SUPERLU_FREE(nnzToRecv); - memAux -= 2 * procs * iword; - if ( procs > 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - SUPERLU_FREE(ptr_to_send); - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + - procs*sizeof(doublecomplex*) + 2*SendCnt * iword + - SendCnt* dword + procs*iword + - 2*maxnnzToRecv*iword + maxnnzToRecv*dword; - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT. - ------------------------------------------------------------*/ - if (nnz_loc_ainf != 0) { - if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_rowind[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * iword); - if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) { - fprintf (stderr, "Malloc fails for *ainf_val[]."); - return (memAux+memRet); - } - memRet += (float) (nnz_loc_ainf * dword); - } - else { - ainf_rowind = NULL; - ainf_val = NULL; - } - if (nnz_loc_asup != 0) { - if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_colind[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * iword); - if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) { - fprintf (stderr, "Malloc fails for *asup_val[]."); - return (memAux + memRet); - } - memRet += (float) (nnz_loc_asup * dword); - } - else { - asup_colind = NULL; - asup_val = NULL; - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = ainf_colptr[0]; ainf_colptr[0] = 0; - for (j = 1; j < ilsum_j[nsupers_j]; j++) { - k += jsize; - jsize = ainf_colptr[j]; - ainf_colptr[j] = k; - } - ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; - i = 0; - isize = asup_rowptr[0]; asup_rowptr[0] = 0; - for (j = 1; j < ilsum_i[nsupers_i]; j++) { - i += isize; - isize = asup_rowptr[j]; - asup_rowptr[j] = i; - } - asup_rowptr[ilsum_i[nsupers_i]] = i + isize; - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - jcol = ja[i]; - irow = ia[i]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - /* Count nonzeros in each column of L / row of U */ - if (gbi >= gbj) { - j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); - k = ainf_colptr[j]; - ainf_rowind[k] = irow; - ainf_val[k] = aij[i]; - ainf_colptr[j] ++; - } - else { - j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); - k = asup_rowptr[j]; - asup_colind[k] = jcol; - asup_val[k] = aij[i]; - asup_rowptr[j] ++; - } - } - - /* Reset the column pointers to the beginning of each column */ - for (j = ilsum_j[nsupers_j]; j > 0; j--) - ainf_colptr[j] = ainf_colptr[j-1]; - for (j = ilsum_i[nsupers_i]; j > 0; j--) - asup_rowptr[j] = asup_rowptr[j-1]; - ainf_colptr[0] = 0; - asup_rowptr[0] = 0; - - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - memAux -= 2*szbuf*iword + szbuf*dword; - - *p_ainf_colptr = ainf_colptr; - *p_ainf_rowind = ainf_rowind; - *p_ainf_val = ainf_val; - *p_asup_rowptr = asup_rowptr; - *p_asup_colind = asup_colind; - *p_asup_val = asup_val; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit zdist_A()"); - fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); -#endif - - return (-memRet); -} /* dist_A */ - -int_t -zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Pslu_freeable_t *Pslu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * - * - * Purpose - * ======= - * Distribute the input matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * This routine should not be called for this case, an error - * is generated. Instead, pddistribute routine should be called. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (Input) int - * Dimension of the matrix. - * - * A (Input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. - * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. - * - * ScalePermstruct (Input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (Input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (Input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (Input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * < 0, number of bytes allocated on return from the dist_symbLU - * > 0, number of bytes allocated for performing the distribution - * of the data, when out of memory. - * (an approximation). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t Glu_freeable_n; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, - len, len1, nsupc, nsupc_gb, ii, nprocs; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - doublecomplex *a; - int_t *asub, *xa; - int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; - doublecomplex *asup_val, *ainf_val; - int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; - int_t next_ind; /* next available position in index[*] */ - int_t next_val; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ - int_t *recvBuf; - int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in - the full array (local, block column wise) */ - /*-- Auxiliary arrays; freed on return --*/ - int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *LUb_length; /* L,U block length; size nsupers_ij */ - int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ - int_t *LUb_number; /* global block number; size nsupers_ij */ - int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ - int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ - doublecomplex zero = {0.0, 0.0}; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float memStrLU, memA, - memDist = 0.; /* memory used for redistributing the data, which does - not include the memory for the numerical values of L and U */ - float memNLU = 0.; /* memory allocated for storing the numerical values of - L and U, that will be used in the numeric factorization */ - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif - - /* Initialization. */ - iam = grid->iam; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); -#endif - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nprocs = grid->npcol * grid->nprow; - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - Astore = (NRformat_loc *) A->Store; - - iword = sizeof(int_t); - dword = sizeof(doublecomplex); - - if (fact == SamePattern_SameRowPerm) { - ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); - } - - if ((memStrLU = - dist_symbLU (n, Pslu_freeable, - Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) - return (memStrLU); - memDist += (-memStrLU); - xsup = Glu_persist->xsup; /* supernode and column mapping */ - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ - nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ - nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); - if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { - fprintf (stderr, "Malloc fails for ilsum[]."); - return (memDist + memNLU); - } - memNLU += (nsupers_i+1) * iword; - if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { - fprintf (stderr, "Malloc fails for ilsum_j[]."); - return (memDist + memNLU); - } - memDist += (nsupers_j+1) * iword; - - /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */ - ilsum[0] = 0; - ldaspa = 0; - for (gb = 0; gb < nsupers; gb++) - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - ilsum[nsupers_i] = ldaspa; - - ldaspa_j = 0; ilsum_j[0] = 0; - for (gb = 0; gb < nsupers; gb++) - if (mycol == PCOL( gb, grid )) { - i = SuperSize( gb ); - ldaspa_j += i; - lb = LBj( gb, grid ); - ilsum_j[lb + 1] = ilsum_j[lb] + i; - } - ilsum_j[nsupers_j] = ldaspa_j; - - if ((memA = zdist_A(A, ScalePermstruct, Glu_persist, - grid, &ainf_colptr, &ainf_rowind, &ainf_val, - &asup_rowptr, &asup_colind, &asup_val, - ilsum, ilsum_j)) > 0) - return (memDist + memA + memNLU); - memDist += (-memA); - - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ - - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - if ( !(ToRecv = intCalloc_dist(nsupers)) ) { - fprintf(stderr, "Calloc fails for ToRecv[]."); - return (memDist + memNLU); - } - memNLU += nsupers * iword; - - k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for ToSendR[]."); - return (memDist + memNLU); - } - memNLU += k*sizeof(int_t*); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) { - fprintf(stderr, "Malloc fails for index[]."); - return (memDist + memNLU); - } - memNLU += j*iword; - - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - - /* Auxiliary arrays used to set up L and U block data structures. - They are freed on return. */ - if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_length[]."); - return (memDist + memNLU); - } - if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Malloc fails for LUb_indptr[]."); - return (memDist + memNLU); - } - if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_number[]."); - return (memDist + memNLU); - } - if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { - fprintf(stderr, "Calloc fails for LUb_valptr[]."); - return (memDist + memNLU); - } - memDist += 4 * nsupers_ij * iword; - - k = CEILING( nsupers, grid->nprow ); - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); - return (memDist + memNLU); - } - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*); - Unzval_br_ptr[nsupers_i-1] = NULL; - Ufstnz_br_ptr[nsupers_i-1] = NULL; - - if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Malloc fails for ToSendD[]."); - return (memDist + memNLU); - } - memNLU += nsupers_i*iword; - if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { - fprintf(stderr, "Calloc fails for rb_marker[]."); - return (memDist + memNLU); - } - memDist += (nsupers_i + nsupers_j)*iword; - - /* Auxiliary arrays used to set up L, U block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) - * sp_ienv_dist(3))) ) { - fprintf(stderr, "Calloc fails for SPA dense[]."); - return (memDist + memNLU); - } - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for fmod[]."); - return (memDist + memNLU); - } - if ( !(bmod = intCalloc_dist(nsupers_i)) ) { - fprintf(stderr, "Calloc fails for bmod[]."); - return (memDist + memNLU); - } - /* ------------------------------------------------ */ - memNLU += 2*nsupers_i*iword + - SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); - return (memDist + memNLU); - } - memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*); - Lnzval_bc_ptr[nsupers_j-1] = NULL; - Lrowind_bc_ptr[nsupers_j-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[]."); - return (memDist + memNLU); - } - len = nsupers_j * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for fsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[]."); - return (memDist + memNLU); - } - if ( !(index = intMalloc_dist(len)) ) { - fprintf(stderr, "Malloc fails for bsendx_plist[0]"); - return (memDist + memNLU); - } - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ - memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid); /* Local block number row wise */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - if ( myrow == jbrow ) { /* Block row jb in my process row */ - /* Scatter A into SPA. */ - for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { - for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { - if (i >= asup_rowptr[ilsum[nsupers_i]]) - printf ("ERR7\n"); - jcol = asup_colind[i]; - if (jcol >= n) - printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - gb = BlockNum( jcol ); - lb = LBj( gb, grid ); - if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); - jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); - if (jcol >= ldaspa_j) - printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n", - iam, jb, gb, j, jcol); - dense_col[jcol] = asup_val[i]; - } - dense_col += ldaspa_j; - } - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - /* Count number of blocks and length of each block. */ - nrbu = 0; - len = 0; /* Number of column subscripts I own. */ - len1 = 0; /* number of fstnz subscripts */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - if (i >= xusub[nsupers_i]) printf ("ERR10\n"); - jcol = usub[i]; - gb = BlockNum( jcol ); /* Global block number */ - - /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) - printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", - iam, jb, gb, jcol, jbcol, pc); */ - - lb = LBj( gb, grid ); /* Local block number */ - pc = PCOL( gb, grid ); /* Process col owning this block */ - if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; - /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ - pr = PROW( gb, grid ); - if ( pr != jbrow && mycol == pc) - bsendx_plist[lb][jbrow] = YES; - if (mycol == pc) { - len += nsupc; - LUb_length[lb] += nsupc; - ToSendD[ljb_i] = YES; - if (Urb_marker[lb] <= jb) { /* First see this block */ - if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; - Urb_marker[lb] = jb + 1; - LUb_number[nrbu] = gb; - /* if (gb == 391825 && jb == 145361) - printf ("Pe[%d] T1 [%d %d] nrbu %d \n", - iam, jb, gb, nrbu); */ - nrbu ++; - len1 += SuperSize( gb ); - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[ljb_i];/* Mod. count for back solve */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - } - } /* for i ... */ - - if ( nrbu ) { - /* Sort the blocks of U in increasing block column index. - SuperLU_DIST assumes this is true */ - /* simple insert sort algorithm */ - /* to be transformed in quick sort */ - for (j = 1; j < nrbu; j++) { - k = LUb_number[j]; - for (i=j-1; i>=0 && LUb_number[i] > k; i--) { - LUb_number[i+1] = LUb_number[i]; - } - LUb_number[i+1] = k; - } - - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) { - fprintf (stderr, "Malloc fails for Uindex[]"); - return (memDist + memNLU); - } - Ufstnz_br_ptr[ljb_i] = index; - if (!(Unzval_br_ptr[ljb_i] = - doublecomplexMalloc_dist(len))) { - fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); - return (memDist + memNLU); - } - memNLU += (len1+1)*iword + len*dword; - uval = Unzval_br_ptr[ljb_i]; - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = nrbu; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index */ - index[len1] = -1; /* End marker */ - next_ind = BR_HEADER; - next_val = 0; - for (k = 0; k < nrbu; k++) { - gb = LUb_number[k]; - lb = LBj( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; /* Reset vector of block length */ - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) - index[next_ind] = FstBlockC( jb + 1 ); - LUb_valptr[lb] = next_val; - next_val += len; - } - /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], - and the initial values of A from SPA into Unzval_br_ptr[]. */ - for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { - jcol = usub[i]; - gb = BlockNum( jcol ); - - if ( mycol == PCOL( gb, grid ) ) { - lb = LBj( gb, grid ); - k = LUb_indptr[lb]; /* Start fstnz in index */ - index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); - } - } /* for i ... */ - - for (i = 0; i < nrbu; i++) { - gb = LUb_number[i]; - lb = LBj( gb, grid ); - next_ind = LUb_indptr[lb]; - k = FstBlockC( jb + 1); - jcol = ilsum_j[lb]; - for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { - dense_col = dense; - j = index[next_ind+jj]; - for (ii = j; ii < k; ii++) { - uval[LUb_valptr[lb]++] = dense_col[jcol]; - dense_col[jcol] = zero; - dense_col += ldaspa_j; - } - } - } - } else { - Ufstnz_br_ptr[ljb_i] = NULL; - Unzval_br_ptr[ljb_i] = NULL; - } /* if nrbu ... */ - } /* if myrow == jbrow */ - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - if (mycol == jbcol) { /* Block column jb in my process column */ - /* Scatter A_inf into SPA. */ - for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { - for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { - irow = ainf_rowind[i]; - if (irow >= n) printf ("Pe[%d] ERR1\n", iam); - gb = BlockNum( irow ); - if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); - dense_col[irow] = ainf_val[i]; - } - } - dense_col += ldaspa; - } - - /* sort the indices of the diagonal block at the beginning of xlsub */ - if (myrow == jbrow) { - k = xlsub[ljb_j]; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - if (irow < nsupc + fsupc && i != k+irow-fsupc) { - lsub[i] = lsub[k + irow - fsupc]; - lsub[k + irow - fsupc] = irow; - i --; - } - } - } - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && - myrow == jbrow) { - fsendx_plist[ljb_j][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (Lrb_marker[lb] <= jb) { /* First see this block */ - Lrb_marker[lb] = jb + 1; - LUb_length[lb] = 1; - LUb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else - ++LUb_length[lb]; - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* If I am the owner of the diagonal block, order it first in LUb_number. - Necessary for SuperLU_DIST routines */ - kseen = EMPTY; - for (j = 0; j < nrbl; j++) { - if (LUb_number[j] == jb) - kseen = j; - } - if (kseen != EMPTY && kseen != 0) { - LUb_number[kseen] = LUb_number[0]; - LUb_number[0] = jb; - } - - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) { - fprintf (stderr, "Malloc fails for index[]"); - return (memDist + memNLU); - } - Lrowind_bc_ptr[ljb_j] = index; - if (!(Lnzval_bc_ptr[ljb_j] = - doublecomplexMalloc_dist(len*nsupc))) { - fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb); - return (memDist + memNLU); - } - memNLU += len1*iword + len*nsupc*dword; - - lusup = Lnzval_bc_ptr[ljb_j]; - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_ind = BC_HEADER; - next_val = 0; - for (k = 0; k < nrbl; ++k) { - gb = LUb_number[k]; - lb = LBi( gb, grid ); - len = LUb_length[lb]; - LUb_length[lb] = 0; - index[next_ind++] = gb; /* Descriptor */ - index[next_ind++] = len; - LUb_indptr[lb] = next_ind; - LUb_valptr[lb] = next_val; - next_ind += len; - next_val += len; - } - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = LUb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = LUb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb_j] = NULL; - Lnzval_bc_ptr[ljb_j] = NULL; - } /* if nrbl ... */ - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(ilsum_j); - SUPERLU_FREE(Urb_marker); - SUPERLU_FREE(LUb_length); - SUPERLU_FREE(LUb_indptr); - SUPERLU_FREE(LUb_number); - SUPERLU_FREE(LUb_valptr); - SUPERLU_FREE(Lrb_marker); - SUPERLU_FREE(dense); - - /* Free the memory used for storing L and U */ - SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); - if (lsub != NULL) - SUPERLU_FREE(lsub); - if (usub != NULL) - SUPERLU_FREE(usub); - - /* Free the memory used for storing A */ - SUPERLU_FREE(ainf_colptr); - if (ainf_rowind != NULL) { - SUPERLU_FREE(ainf_rowind); - SUPERLU_FREE(ainf_val); - } - SUPERLU_FREE(asup_rowptr); - if (asup_colind != NULL) { - SUPERLU_FREE(asup_colind); - SUPERLU_FREE(asup_val); - } - - /* exchange information about bsendx_plist in between column of processors */ - k = SUPERLU_MAX( grid->nprow, grid->npcol); - if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { - fprintf (stderr, "Malloc fails for recvBuf[]."); - return (memDist + memNLU); - } - if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for nnzToRecv[]."); - return (memDist + memNLU); - } - if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { - fprintf (stderr, "Malloc fails for ptrToRecv[]."); - return (memDist + memNLU); - } - - if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) - memDist = nsupers*k*iword +4*nprocs * sizeof(int); - - for (p = 0; p < nprocs; p++) - nnzToRecv[p] = 0; - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - nnzToRecv[p] += grid->npcol; - } - i = 0; - for (p = 0; p < nprocs; p++) { - ptrToRecv[p] = i; - i += nnzToRecv[p]; - ptrToSend[p] = 0; - if (p != iam) - nnzToSend[p] = nnzToRecv[iam]; - else - nnzToSend[p] = 0; - } - nnzToRecv[iam] = 0; - i = ptrToRecv[iam]; - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - if (p == iam) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - for (j = 0; j < grid->npcol; j++, i++) - recvBuf[i] = ToSendR[ljb_j][j]; - } - } - - MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, - recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); - - for (jb = 0; jb < nsupers; jb++) { - jbcol = PCOL( jb, grid ); - jbrow = PROW( jb, grid ); - p = PNUM(jbrow, jbcol, grid); - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - ljb_i = LBi( jb, grid ); /* Local block number row wise */ - /* (myrow == jbrow) { - if (ToSendD[ljb_i] == YES) - ToRecv[jb] = 1; - } - else { - if (recvBuf[ptrToRecv[p] + mycol] == YES) - ToRecv[jb] = 2; - } */ - if (recvBuf[ptrToRecv[p] + mycol] == YES) { - if (myrow == jbrow) - ToRecv[jb] = 1; - else - ToRecv[jb] = 2; - } - if (mycol == jbcol) { - for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) - ToSendR[ljb_j][i] = recvBuf[j]; - ToSendR[ljb_j][mycol] = EMPTY; - } - ptrToRecv[p] += grid->npcol; - } - - /* exchange information about bsendx_plist in between column of processors */ - MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, - MPI_MAX, grid->cscp.comm); - - for (jb = 0; jb < nsupers; jb ++) { - jbcol = PCOL( jb, grid); - jbrow = PROW( jb, grid); - if (mycol == jbcol) { - ljb_j = LBj( jb, grid ); /* Local block number column wise */ - if (myrow == jbrow ) { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { - (*bsendx_plist)[k] = recvBuf[k]; - if ((*bsendx_plist)[k] != EMPTY) - nbsendx ++; - } - } - else { - for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) - (*bsendx_plist)[k] = EMPTY; - } - } - } - - SUPERLU_FREE(nnzToRecv); - SUPERLU_FREE(ptrToRecv); - SUPERLU_FREE(nnzToSend); - SUPERLU_FREE(ptrToSend); - SUPERLU_FREE(recvBuf); - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist, - ToRecv, ToSendR, ToSendD - */ - CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); -#endif - - return (- (memDist+memNLU)); -} /* dist_psymbtonum */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzutil.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,530 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -#include -#include "superlu_zdefs.h" - -/* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - */ -int pzCompRow_loc_to_CompCol_global -( - int_t need_value, /* Input. Whether need to gather numerical values */ - SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ - gridinfo_t *grid, /* Input */ - SuperMatrix *GA /* Output */ -) -{ - NRformat_loc *Astore; - NCformat *GAstore; - doublecomplex *a, *a_loc; - int_t *colind, *rowptr; - int_t *colptr_loc, *rowind_loc; - int_t m_loc, n, i, j, k, l; - int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz; - doublecomplex *a_recv; /* Buffer to receive the blocks of values. */ - doublecomplex *a_buf; /* Buffer to merge blocks into block columns. */ - int_t *colcnt, *itemp; - int_t *colptr_send; /* Buffer to redistribute the column pointers of the - local block rows. - Use n_loc+1 pointers for each block. */ - int_t *colptr_blk; /* The column pointers for each block, after - redistribution to the local block columns. - Use n_loc+1 pointers for each block. */ - int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ - int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ - int_t *fst_rows, *n_locs; - int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; - int it, n_loc, procs; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global"); -#endif - - /* Initialization. */ - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ - - /* ------------------------------------------------------------ - FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. - ------------------------------------------------------------*/ - zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, - &rowind_loc, &colptr_loc); - /* Change local row index numbers to global numbers. */ - for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; - -#if ( DEBUGlevel>=2 ) - printf("Proc %d\n", grid->iam); - PrintInt10("rowind_loc", nnz_loc, rowind_loc); - PrintInt10("colptr_loc", n+1, colptr_loc); -#endif - - procs = grid->nprow * grid->npcol; - if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) - ABORT("Malloc fails for fst_rows[]"); - n_locs = fst_rows + procs; - MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, - grid->comm); - for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; - n_locs[procs-1] = n - fst_rows[procs-1]; - if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) - ABORT("Malloc fails for recvcnts[]"); - sendcnts = recvcnts + procs; - rdispls = sendcnts + procs; - sdispls = rdispls + procs; - itemp_32 = sdispls + procs; - - /* All-to-all transfer column pointers of each block. - Now the matrix view is P-by-P block-partition. */ - /* n column starts for each column, and procs column ends for each block */ - if ( !(colptr_send = intMalloc_dist(n + procs)) ) - ABORT("Malloc fails for colptr_send[]"); - if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) - ABORT("Malloc fails for colptr_blk[]"); - for (i = 0, j = 0; i < procs; ++i) { - for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; - colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ - sendcnts[i] = n_locs[i] + 1; -#if ( DEBUGlevel>=1 ) - assert(j == fst_rows[i]); -#endif - sdispls[i] = j + i; - recvcnts[i] = n_loc + 1; - rdispls[i] = i * (n_loc + 1); - j += n_locs[i]; /* First column of next block in colptr_loc[] */ - } - MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, - colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); - - /* Adjust colptr_blk[] so that they contain the local indices of the - column pointers in the receive buffer. */ - nnz = 0; /* The running sum of the nonzeros counted by far */ - k = 0; - for (i = 0; i < procs; ++i) { - for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { - colnnz = colptr_blk[j+1] - colptr_blk[j]; - /*assert(k<=j);*/ - colptr_blk[k] = nnz; - nnz += colnnz; /* Start of the next column */ - ++k; - } - colptr_blk[k++] = nnz; /* Add an END marker for each block */ - } - /*assert(k == (n_loc+1)*procs);*/ - - /* Now prepare to transfer row indices and values. */ - sdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; - sdispls[i+1] = sdispls[i] + sendcnts[i]; - } - sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; - for (i = 0; i < procs; ++i) { - j = rdispls[i]; /* Point to this block in colptr_blk[]. */ - recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; - } - rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ - for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; - - k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ - if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) - ABORT("Malloc fails for rowind_recv[]"); - rowind_buf = rowind_recv + k; - MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, - rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) ) - ABORT("Malloc fails for rowind_recv[]"); - a_buf = a_recv + k; - MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX, - a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, - grid->comm); - } - - /* Reset colptr_loc[] to point to the n_loc global columns. */ - colptr_loc[0] = 0; - itemp = colptr_send; - for (j = 0; j < n_loc; ++j) { - colnnz = 0; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1) + j; /* j-th column in i-th block */ - colnnz += colptr_blk[k+1] - colptr_blk[k]; - } - colptr_loc[j+1] = colptr_loc[j] + colnnz; - itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ - } - itemp[n_loc] = colptr_loc[n_loc]; - - /* Merge blocks of row indices into columns of row indices. */ - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - rowind_buf[itemp[j]] = rowind_recv[l]; - ++itemp[j]; - } - } - } - - if ( need_value ) { - for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - a_buf[itemp[j]] = a_recv[l]; - ++itemp[j]; - } - } - } - } - - /* ------------------------------------------------------------ - SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. - ------------------------------------------------------------*/ - GA->nrow = A->nrow; - GA->ncol = A->ncol; - GA->Stype = SLU_NC; - GA->Dtype = A->Dtype; - GA->Mtype = A->Mtype; - GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); - if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); - - /* First gather the size of each piece. */ - nnz_loc = colptr_loc[n_loc]; - MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); - for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; - GAstore->nnz = nnz; - - if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); - if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); - - /* Allgatherv for row indices. */ - rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + itemp[i]; - itemp_32[i] = itemp[i]; - } - itemp_32[procs-1] = itemp[procs-1]; - it = nnz_loc; - MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, - itemp_32, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); - MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, - itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); - } else GAstore->nzval = NULL; - - /* Now gather the column pointers. */ - rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + n_locs[i]; - itemp_32[i] = n_locs[i]; - } - itemp_32[procs-1] = n_locs[procs-1]; - MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, - itemp_32, rdispls, mpi_int_t, grid->comm); - - /* Recompute column pointers. */ - for (i = 1; i < procs; ++i) { - k = rdispls[i]; - for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; - itemp[i] += itemp[i-1]; /* prefix sum */ - } - GAstore->colptr[n] = nnz; - -#if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { - printf("After pdCompRow_loc_to_CompCol_global()\n"); - zPrint_CompCol_Matrix_dist(GA); - } -#endif - - SUPERLU_FREE(a_loc); - SUPERLU_FREE(rowind_loc); - SUPERLU_FREE(colptr_loc); - SUPERLU_FREE(fst_rows); - SUPERLU_FREE(recvcnts); - SUPERLU_FREE(colptr_send); - SUPERLU_FREE(colptr_blk); - SUPERLU_FREE(rowind_recv); - if ( need_value) SUPERLU_FREE(a_recv); -#if ( DEBUGlevel>=1 ) - if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat)); - CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global"); -#endif - return 0; -} /* pzCompRow_loc_to_CompCol_global */ - - -/* - * Permute the distributed dense matrix: B <= perm(X). - * perm[i] = j means the i-th row of X is in the j-th row of B. - */ -int pzPermute_Dense_Matrix -( - int_t fst_row, - int_t m_loc, - int_t row_to_proc[], - int_t perm[], - doublecomplex X[], int ldx, - doublecomplex B[], int ldb, - int nrhs, - gridinfo_t *grid -) -{ - int_t i, j, k, l; - int p, procs; - int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - doublecomplex *send_dbuf, *recv_dbuf; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()"); -#endif - - procs = grid->nprow * grid->npcol; - if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) - ABORT("Malloc fails for sendcnts[]."); - sendcnts_nrhs = sendcnts + procs; - recvcnts = sendcnts_nrhs + procs; - recvcnts_nrhs = recvcnts + procs; - sdispls = recvcnts_nrhs + procs; - sdispls_nrhs = sdispls + procs; - rdispls = sdispls_nrhs + procs; - rdispls_nrhs = rdispls + procs; - ptr_to_ibuf = rdispls_nrhs + procs; - ptr_to_dbuf = ptr_to_ibuf + procs; - - for (i = 0; i < procs; ++i) sendcnts[i] = 0; - - /* Count the number of X entries to be sent to each process.*/ - for (i = fst_row; i < fst_row + m_loc; ++i) { - p = row_to_proc[perm[i]]; - ++sendcnts[p]; - } - MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - sdispls_nrhs[0] = rdispls_nrhs[0] = 0; - sendcnts_nrhs[0] = sendcnts[0] * nrhs; - recvcnts_nrhs[0] = recvcnts[0] * nrhs; - for (i = 1; i < procs; ++i) { - sdispls[i] = sdispls[i-1] + sendcnts[i-1]; - sdispls_nrhs[i] = sdispls[i] * nrhs; - rdispls[i] = rdispls[i-1] + recvcnts[i-1]; - rdispls_nrhs[i] = rdispls[i] * nrhs; - sendcnts_nrhs[i] = sendcnts[i] * nrhs; - recvcnts_nrhs[i] = recvcnts[i] * nrhs; - } - k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ - l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ - /*assert(k == m_loc);*/ - /*assert(l == m_loc);*/ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - - for (i = 0; i < procs; ++i) { - ptr_to_ibuf[i] = sdispls[i]; - ptr_to_dbuf[i] = sdispls_nrhs[i]; - } - - /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ - for (i = fst_row; i < fst_row + m_loc; ++i) { - j = perm[i]; - p = row_to_proc[j]; - send_ibuf[ptr_to_ibuf[p]] = j; - j = ptr_to_dbuf[p]; - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - send_dbuf[j++] = X[i-fst_row + k*ldx]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - - /* Transfer the (permuted) row indices and numerical values. */ - MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, - recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, - grid->comm); - - /* Copy the buffer into b. */ - for (i = 0, l = 0; i < m_loc; ++i) { - j = recv_ibuf[i] - fst_row; /* Relative row number */ - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - B[j + k*ldb] = recv_dbuf[l++]; - } - } - - SUPERLU_FREE(sendcnts); - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()"); -#endif - return 0; -} /* pzPermute_Dense_Matrix */ - - -/* - * Initialize the data structure for the solution phase. - */ -int zSolveInit(superlu_options_t *options, SuperMatrix *A, - int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) -{ - int_t *row_to_proc, *inv_perm_c, *itemp; - NRformat_loc *Astore; - int_t i, fst_row, m_loc, p; - int procs; - - Astore = (NRformat_loc *) A->Store; - fst_row = Astore->fst_row; - m_loc = Astore->m_loc; - procs = grid->nprow * grid->npcol; - - if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) - ABORT("Malloc fails for row_to_proc[]"); - SOLVEstruct->row_to_proc = row_to_proc; - if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) - ABORT("Malloc fails for inv_perm_c[]."); - for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; - SOLVEstruct->inv_perm_c = inv_perm_c; - - /* ------------------------------------------------------------ - EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. - SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. - - NOTE: For those processes that do not own any row, it must - must be set so that fst_row == A->nrow. - ------------------------------------------------------------*/ - if ( !(itemp = intMalloc_dist(procs+1)) ) - ABORT("Malloc fails for itemp[]"); - MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); - itemp[procs] = A->nrow; - for (p = 0; p < procs; ++p) { - for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; - } -#if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { - printf("fst_row = %d\n", fst_row); - PrintInt10("row_to_proc", A->nrow, row_to_proc); - PrintInt10("inv_perm_c", A->ncol, inv_perm_c); - } -#endif - SUPERLU_FREE(itemp); - -#if 0 - /* Compute the mapping between rows and processes. */ - /* XSL NOTE: What happens if # of mapped processes is smaller - than total Procs? For the processes without any row, let - fst_row be EMPTY (-1). Make sure this case works! */ - MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); - itemp[procs] = n; - for (p = 0; p < procs; ++p) { - j = itemp[p]; - if ( j != EMPTY ) { - k = itemp[p+1]; - if ( k == EMPTY ) k = n; - for (i = j ; i < k; ++i) row_to_proc[i] = p; - } - } -#endif - - get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, - &SOLVEstruct->num_diag_procs, - &SOLVEstruct->diag_procs, - &SOLVEstruct->diag_len); - - if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) - ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, - LUstruct->Glu_persist, SOLVEstruct); - - if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) - SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) - ABORT("Malloc fails for gsmv_comm[]"); - SOLVEstruct->A_colind_gsmv = NULL; - - options->SolveInitialized = YES; - return 0; -} /* zSolveInit */ - -/* - * Release the resources used for the solution phase. - */ -void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) -{ - int_t *it; - pxgstrs_finalize(SOLVEstruct->gstrs_comm); - if ( options->RefineInitialized ) { - pzgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; - } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; -} /* zSolveFinalize */ - -/* - * Check the inf-norm of the error vector - */ -void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) -{ - double err, xnorm, temperr, tempxnorm; - doublecomplex *x_work, *xtrue_work; - doublecomplex temp; - int i, j; - - for (j = 0; j < nrhs; j++) { - x_work = &x[j*ldx]; - xtrue_work = &xtrue[j*ldxtrue]; - err = xnorm = 0.0; - for (i = 0; i < n; i++) { - z_sub(&temp, &x_work[i], &xtrue_work[i]); - err = SUPERLU_MAX(err, z_abs(&temp)); - xnorm = SUPERLU_MAX(xnorm, z_abs(&x_work[i])); - } - - /* get the golbal max err & xnrom */ - temperr = err; - tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - - err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); - } -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/slamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/slamch.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/slamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/slamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,983 +0,0 @@ -#include -#include "Cnames.h" -#define TRUE_ (1) -#define FALSE_ (0) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define dabs(x) (double)abs(x) - -float slamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMCH determines single precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by SLAMCH: - = 'E' or 'e', SLAMCH := eps - = 'S' or 's , SLAMCH := sfmin - = 'B' or 'b', SLAMCH := base - = 'P' or 'p', SLAMCH := eps*base - = 'N' or 'n', SLAMCH := t - = 'R' or 'r', SLAMCH := rnd - = 'M' or 'm', SLAMCH := emin - = 'U' or 'u', SLAMCH := rmin - = 'L' or 'l', SLAMCH := emax - = 'O' or 'o', SLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ -/* >>Start of File<< - Initialized data */ - static int first = TRUE_; - /* System generated locals */ - int i__1; - float ret_val; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static float base; - static int beta; - static float emin, prec, emax; - static int imin, imax; - static int lrnd; - static float rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static float small, sfmin; - extern /* Subroutine */ int slamc2_(int *, int *, int *, float - *, int *, float *, int *, float *); - static int it; - static float rnd, eps; - - - - if (first) { - first = FALSE_; - slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (float) beta; - t = (float) it; - if (lrnd) { - rnd = 1.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1) / 2; - } else { - rnd = 0.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1); - } - prec = eps * base; - emin = (float) imin; - emax = (float) imax; - sfmin = rmin; - small = 1.f / rmax; - if (small >= sfmin) { - -/* Use SMALL plus a bit, to avoid the possibility of rou -nding - causing overflow when computing 1/sfmin. */ - - sfmin = small * (eps + 1.f); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of SLAMCH */ - -} /* slamch_ */ - - -/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - float r__1, r__2; - /* Local variables */ - static int lrnd; - static float a, b, c, f; - static int lbeta; - static float savec; - static int lieee1; - static float t1, t2; - extern double slamc3_(float *, float *); - static int lt; - static float one, qtr; - - - - if (first) { - first = FALSE_; - one = 1.f; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.f; - c = slamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = slamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - lbeta = c + qtr; - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (float) lbeta; - r__1 = b / 2; - r__2 = -(double)b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - r__1 = b / 2; - r__2 = b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - r__1 = b / 2; - t1 = slamc3_(&r__1, &a); - r__1 = b / 2; - t2 = slamc3_(&r__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of SLAMC1 */ - -} /* slamc1_ */ - - -/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float * - eps, int *emin, float *rmin, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) FLOAT - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) FLOAT - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) FLOAT - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - float r__1, r__2, r__3, r__4, r__5; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static int ieee; - static float half; - static int lrnd; - static float leps, zero, a, b, c; - static int i, lbeta; - static float rbase; - static int lemin, lemax, gnmin; - static float small; - static int gpmin; - static float third, lrmin, lrmax, sixth; - static int lieee1; - extern /* Subroutine */ int slamc1_(int *, int *, int *, - int *); - extern double slamc3_(float *, float *); - extern /* Subroutine */ int slamc4_(int *, float *, int *), - slamc5_(int *, int *, int *, int *, int *, - float *); - static int lt, ngnmin, ngpmin; - static float one, two; - - - - if (first) { - first = FALSE_; - zero = 0.f; - one = 1.f; - two = 2.f; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - slamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (float) lbeta; - i__1 = -lt; - a = pow_ri(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - r__1 = -(double)half; - sixth = slamc3_(&b, &r__1); - third = slamc3_(&sixth, &sixth); - r__1 = -(double)half; - b = slamc3_(&third, &r__1); - b = slamc3_(&b, &sixth); - b = dabs(b); - if (b < leps) { - b = leps; - } - - leps = 1.f; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - r__1 = half * leps; -/* Computing 5th power */ - r__3 = two, r__4 = r__3, r__3 *= r__3; -/* Computing 2nd power */ - r__5 = leps; - r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5); - c = slamc3_(&r__1, &r__2); - r__1 = -(double)c; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - r__1 = -(double)b; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - r__1 = small * rbase; - small = slamc3_(&r__1, &zero); -/* L20: */ - } - a = slamc3_(&one, &small); - slamc4_(&ngpmin, &one, &lbeta); - r__1 = -(double)one; - slamc4_(&ngnmin, &r__1, &lbeta); - slamc4_(&gpmin, &a, &lbeta); - r__1 = -(double)a; - slamc4_(&gnmin, &r__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine SLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine SLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.f; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - r__1 = lrmin * rbase; - lrmin = slamc3_(&r__1, &zero); -/* L30: */ - } - -/* Finally, call SLAMC5 to compute EMAX and RMAX. */ - - slamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of SLAMC2 */ - -} /* slamc2_ */ - - -double slamc3_(float *a, float *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) FLOAT - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - float ret_val; - - - - ret_val = *a + *b; - - return ret_val; - -/* End of SLAMC3 */ - -} /* slamc3_ */ - - -/* Subroutine */ int slamc4_(int *emin, float *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC4 is a service routine for SLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) FLOAT - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static float zero, a; - static int i; - static float rbase, b1, b2, c1, c2, d1, d2; - extern double slamc3_(float *, float *); - static float one; - - - - a = *start; - one = 1.f; - rbase = one / *base; - zero = 0.f; - *emin = 1; - r__1 = a * rbase; - b1 = slamc3_(&r__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - r__1 = a / *base; - b1 = slamc3_(&r__1, &zero); - r__1 = b1 * *base; - c1 = slamc3_(&r__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - r__1 = a * rbase; - b2 = slamc3_(&r__1, &zero); - r__1 = b2 / rbase; - c2 = slamc3_(&r__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of SLAMC4 */ - -} /* slamc4_ */ - - -/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A logical flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) FLOAT - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static float c_b5 = 0.f; - - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static int lexp; - static float oldy; - static int uexp, i; - static float y, z; - static int nbits; - extern double slamc3_(float *, float *); - static float recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1.f / *beta; - z = *beta - 1.f; - y = 0.f; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.f) { - oldy = y; - } - y = slamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.f) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - r__1 = y * *beta; - y = slamc3_(&r__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of SLAMC5 */ - -} /* slamc5_ */ - - -double pow_ri(float *ap, int *bp) -{ -double pow, x; -int n; - -pow = 1; -x = *ap; -n = *bp; - -if(n != 0) - { - if(n < 0) - { - n = -n; - x = 1/x; - } - for( ; ; ) - { - if(n & 01) - pow *= x; - if(n >>= 1) - x *= x; - else - break; - } - } -return(pow); -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_colorder.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_colorder.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_colorder.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_colorder.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,221 +0,0 @@ -#include "superlu_ddefs.h" - - -int check_perm_dist(char *, int_t, int_t *); - -void -sp_colorder(superlu_options_t *options, SuperMatrix *A, int_t *perm_c, - int_t *etree, SuperMatrix *AC) -{ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * - * sp_colorder() permutes the columns of the original matrix. It performs - * the following steps: - * - * 1. Apply column permutation perm_c[] to A's column pointers to form AC; - * - * 2. If options->Fact = DOFACT, then - * (1) Compute column elimination tree etree[] of AC'AC; - * (2) Post order etree[] to get a postordered elimination tree etree[], - * and a postorder permutation post[]; - * (3) Apply post[] permutation to columns of AC; - * (4) Overwrite perm_c[] with the product perm_c * post. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * Specifies whether or not the elimination tree will be re-used. - * If options->Fact == DOFACT, this means first time factor A, - * etree is computed and output. - * Otherwise, re-factor A, etree is input, unchanged on exit. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NCP; Dtype = SLU__D; Mtype = SLU_GE. - * In the future, more general A can be handled. - * - * perm_c (input/output) int* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->Fact == DOFACT, perm_c is both input and output. - * On output, it is changed according to a postorder of etree. - * Otherwise, perm_c is input. - * - * etree (input/output) int* - * Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol. - * If options->Fact == DOFACT, etree is an output argument, - * otherwise it is an input argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * AC (output) SuperMatrix* - * The resulting matrix after applied the column permutation - * perm_c[] to matrix A. The type of AC can be: - * Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE. - * - */ - - NCformat *Astore; - NCPformat *ACstore; - int_t *iwork, *post; - register int_t n, i; -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter sp_colorder()"); -#endif - - n = A->ncol; - - /* Apply column permutation perm_c to A's column pointers so to - obtain NCP format in AC = A*Pc. */ - AC->Stype = SLU_NCP; - AC->Dtype = A->Dtype; - AC->Mtype = A->Mtype; - AC->nrow = A->nrow; - AC->ncol = A->ncol; - Astore = A->Store; - ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) ); - if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore"); - ACstore->nnz = Astore->nnz; - ACstore->nzval = Astore->nzval; - ACstore->rowind = Astore->rowind; - ACstore->colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t)); - if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg"); - ACstore->colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t)); - if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend"); - -#if ( DEBUGlevel>=3 ) - if ( !iam ) { - PrintInt10("pre_order:", n, perm_c); - check_perm_dist("Initial perm_c", n, perm_c); - } -#endif - - for (i = 0; i < n; i++) { - ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; - ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; - } - - if ( options->Fact == DOFACT ) { - /* Factor A "from scratch" -- we also compute the etree, and - * make perm_c consistent with the postorder of the etree. - */ - - iwork = (int_t*) SUPERLU_MALLOC((n+1)*sizeof(int_t)); - if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); - - if ( A->nrow != A->ncol /* Rectangular matrix */ - || options->ColPerm == MMD_ATA ) { - /* Compute the column etree of A*Pc'. */ - sp_coletree_dist(ACstore->colbeg, ACstore->colend, ACstore->rowind, - A->nrow, A->ncol, etree); - } else { - /* Compute the etree of Pc*(A'+A)*Pc'. */ - int_t *b_colptr, *b_rowind, bnz, j; - int_t *c_colbeg, *c_colend; - - /* Form B = A + A'. */ - at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); - - /* Form C = Pc*B*Pc'. */ - c_colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t)); - c_colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t)); - if (!(c_colbeg) || !(c_colend) ) - ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend"); - for (i = 0; i < n; i++) { - c_colbeg[perm_c[i]] = b_colptr[i]; - c_colend[perm_c[i]] = b_colptr[i+1]; - } - for (j = 0; j < n; ++j) { - for (i = c_colbeg[j]; i < c_colend[j]; ++i) { - b_rowind[i] = perm_c[b_rowind[i]]; - } - } - - /* Compute etree of C. */ - sp_symetree_dist(c_colbeg, c_colend, b_rowind, n, etree); - - SUPERLU_FREE(b_colptr); - if ( bnz ) SUPERLU_FREE(b_rowind); - SUPERLU_FREE(c_colbeg); - SUPERLU_FREE(c_colend); - } -#if ( DEBUGlevel>=3 ) - if ( !iam ) PrintInt10("etree:", n, etree); -#endif - - /* Post order etree */ - post = (int_t *) TreePostorder_dist(n, etree); - /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; - iwork = post; */ - - /* Renumber etree in postorder */ - for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; - for (i = 0; i < n; ++i) etree[i] = iwork[i]; - -#if ( DEBUGlevel>=3 ) - if ( !iam ) PrintInt10("postorder etree:", n, etree); -#endif - - /* Postmultiply A*Pc by post[] */ - for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; - for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; - for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; - for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; - - for (i = 0; i < n; ++i) - iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ - for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; - -#if ( DEBUGlevel>=3 ) - if ( !iam ) { - PrintInt10("Pc*post:", n, perm_c); - check_perm_dist("final perm_c", n, perm_c); - } -#endif - - SUPERLU_FREE (post); - SUPERLU_FREE (iwork); - - } /* end if options->Fact == DOFACT ... */ - - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ACstore, ACstore->colbeg, ACstore->colend */ - CHECK_MALLOC(iam, "Exit sp_colorder()"); -#endif - -} /* SP_COLORDER */ - -int -check_perm_dist(char *what, int_t n, int_t *perm) -{ - register int_t i; - int_t *marker; - marker = (int_t *) intCalloc_dist(n); - - for (i = 0; i < n; ++i) { - if ( marker[perm[i]] == 1 || perm[i] >= n ) { - printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); - ABORT("check_perm_dist"); - } else { - marker[perm[i]] = 1; - } - } - - SUPERLU_FREE(marker); - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_ienv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_ienv.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_ienv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_ienv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -/* - * File name: sp_ienv.c - * History: Modified from lapack routine ILAENV - */ -#include "superlu_ddefs.h" -#include "machines.h" - -int_t -sp_ienv_dist(int_t ispec) -{ -/* - Purpose - ======= - - sp_ienv_dist() is inquired to choose machine-dependent parameters for the - local environment. See ISPEC for a description of the parameters. - - This version provides a set of parameters which should give good, - but not optimal, performance on many of the currently available - computers. Users are encouraged to modify this subroutine to set - the tuning parameters for their particular machine using the option - and problem size information in the arguments. - - Arguments - ========= - - ISPEC (input) int - Specifies the parameter to be returned as the value of SP_IENV_DIST. - = 1: the panel size w; a panel consists of w consecutive - columns of matrix A in the process of Gaussian elimination. - The best value depends on machine's cache characters. - = 2: the relaxation parameter relax; if the number of - nodes (columns) in a subtree of the elimination tree is less - than relax, this subtree is considered as one supernode, - regardless of the their row structures. - = 3: the maximum size for a supernode, which must be greater - than or equal to relaxation parameter (see case 2); - = 4: the minimum row dimension for 2-D blocking to be used; - = 5: the minimum column dimension for 2-D blocking to be used; - = 6: the estimated fills factor for the adjacency structures - of L and U, compared with A; - - (SP_IENV_DIST) (output) int - >= 0: the value of the parameter specified by ISPEC - < 0: if SP_IENV_DIST = -k, the k-th argument had an illegal value. - - ===================================================================== -*/ - int i; - - switch (ispec) { -#if ( MACH==CRAY_T3E ) - case 2: return (4); - case 3: return (30); -#elif ( MACH==IBM ) - case 2: return (10); - case 3: return (60); -#else - case 2: return (6); - case 3: return (50); -#endif - case 6: return (4); - } - - /* Invalid value for ISPEC */ - i = 1; - xerbla_("sp_ienv", &i); - return 0; - -} /* sp_ienv_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,309 +0,0 @@ - - -/* - * -- Distributed SuperLU routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * - */ - -#ifndef __SUPERLU_dDEFS /* allow multiple inclusions */ -#define __SUPERLU_dDEFS - -/* - * File name: superlu_ddefs.h - * Purpose: Distributed SuperLU data types and function prototypes - * History: - */ - -#include "superlu_defs.h" - -/* - * On each processor, the blocks in L are stored in compressed block - * column format, the blocks in U are stored in compressed block row format. - */ -typedef struct { - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ -#if 0 - int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ - double *Lval_buf; /* Buffer for the remote nonzeros of L */ -#endif - int_t *Lsub_buf_2[2]; /* Buffers for the remote subscripts of L*/ - double *Lval_buf_2[2]; /* Buffers for the remote nonzeros of L */ - int_t *Usub_buf; /* Buffer for the remote subscripts of U */ - double *Uval_buf; /* Buffer for the remote nonzeros of U */ - double *ujrow; /* used in panel factorization. */ - int_t bufmax[NBUFFERS]; /* Buffer size; 5 entries - * 0 : size of Lsub_buf[] - * 1 : size of Lval_buf[] - * 2 : size of Usub_buf[] - * 3 : size of Uval_buf[] - * 4 : size of tempv[LDA] - */ - - /*-- Record communication schedule for factorization. --*/ - int_t *ToRecv; /* Recv from no one (0), left (1), and up (2).*/ - int_t *ToSendD; /* Whether need to send down block row. */ - int_t **ToSendR; /* List of processes to send right block col. */ - - /*-- Record communication schedule for solves. --*/ - int_t *fmod; /* Modification count for L-solve */ - int_t **fsendx_plist; /* Column process list to send down Xk */ - int_t *frecv; /* Modifications to be recv'd in proc row */ - int_t nfrecvx; /* Number of Xk I will receive in L-solve */ - int_t nfsendx; /* Number of Xk I will send in L-solve */ - int_t *bmod; /* Modification count for U-solve */ - int_t **bsendx_plist; /* Column process list to send down Xk */ - int_t *brecv; /* Modifications to be recv'd in proc row */ - int_t nbrecvx; /* Number of Xk I will receive in U-solve */ - int_t nbsendx; /* Number of Xk I will send in U-solve */ - - /*-- Auxiliary arrays used for solves. --*/ - int_t *ilsum; /* Starting position of each supernode in lsum - (local) */ - int_t ldalsum; /* LDA of lsum (local) */ - int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */ - int_t SolveMsgVol; /* Volume of messages sent in the solve phase */ -} LocalLU_t; - -typedef struct { - int_t *etree; - Glu_persist_t *Glu_persist; - LocalLU_t *Llu; -} LUstruct_t; - -/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */ -typedef struct { - int_t lbnum; /* Row block number (local). */ - int_t indpos; /* Starting position in Uindex[]. */ -} Ucb_indptr_t; - -/*-- Data structure for communication during matrix-vector multiplication. */ -typedef struct { - int_t *extern_start; - int_t *ind_tosend; /* X indeices to be sent to other processes */ - int_t *ind_torecv; /* X indeices to be received from other processes */ - int_t *ptr_ind_tosend;/* Printers to ind_tosend[] (Size procs) - (also point to val_torecv) */ - int_t *ptr_ind_torecv;/* Printers to ind_torecv[] (Size procs) - (also point to val_tosend) */ - int *SendCounts; /* Numbers of X indices to be sent - (also numbers of X values to be received) */ - int *RecvCounts; /* Numbers of X indices to be received - (also numbers of X values to be sent) */ - double *val_tosend; /* X values to be sent to other processes */ - double *val_torecv; /* X values to be received from other processes */ - int_t TotalIndSend; /* Total number of indices to be sent - (also total number of values to be received) */ - int_t TotalValSend; /* Total number of values to be sent. - (also total number of indices to be received) */ -} pdgsmv_comm_t; - -/*-- Data structure for redistribution of B and X --*/ -typedef struct { - int *B_to_X_SendCnt; - int *X_to_B_SendCnt; - int *ptr_to_ibuf, *ptr_to_dbuf; -} pxgstrs_comm_t; - -/*-- Data structure holding the information for the solution phase --*/ -typedef struct { - int_t *row_to_proc; - int_t *inv_perm_c; - int_t num_diag_procs, *diag_procs, *diag_len; - pdgsmv_comm_t *gsmv_comm; - pxgstrs_comm_t *gstrs_comm; - int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column - indices of A are translated into the relative - positions in the gathered x-vector. - This is re-used in repeated calls to pdgsmv() */ -} SOLVEstruct_t; - - -/*********************************************************************** - * Function prototypes - ***********************************************************************/ - -#ifdef __cplusplus -extern "C" { -#endif - - -/* Supernodal LU factor related */ -extern void -dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, - int_t *, int_t *, Stype_t, Dtype_t, Mtype_t); -extern void -dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t, - int_t, double *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); -extern void -dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *, - double **, int_t **, int_t **); -extern int -pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *, - SuperMatrix *); -extern void -dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *); -extern void -dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t, - Stype_t, Dtype_t, Mtype_t); -extern void -dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, - int_t *, int_t *, int_t *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); -extern void -dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t, - double *, int_t); - -extern void dallocateA_dist (int_t, int_t, double **, int_t **, int_t **); -extern void dGenXtrue_dist (int_t, int_t, double *, int_t); -extern void dFillRHS_dist (char *, int_t, double *, int_t, - SuperMatrix *, double *, int_t); -extern int dcreate_matrix(SuperMatrix *, int, double **, int *, - double **, int *, FILE *, gridinfo_t *); - -/* Driver related */ -extern void dgsequ_dist (SuperMatrix *, double *, double *, double *, - double *, double *, int_t *); -extern double dlangs_dist (char *, SuperMatrix *); -extern void dlaqgs_dist (SuperMatrix *, double *, double *, double, - double, double, char *); -extern void pdgsequ (SuperMatrix *, double *, double *, double *, - double *, double *, int_t *, gridinfo_t *); -extern double pdlangs (char *, SuperMatrix *, gridinfo_t *); -extern void pdlaqgs (SuperMatrix *, double *, double *, double, - double, double, char *); -extern int pdPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[], - double [], int, double [], int, int, - gridinfo_t *); - -extern int sp_dtrsv_dist (char *, char *, char *, SuperMatrix *, - SuperMatrix *, double *, int *); -extern int sp_dgemv_dist (char *, double, SuperMatrix *, double *, - int, double, double *, int); -extern int sp_dgemm_dist (char *, char *, int, int, int, double, - SuperMatrix *, double *, int, double, - double *, int); - -extern int_t ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, - LUstruct_t *, gridinfo_t *); -extern void pdgssvx_ABglobal(superlu_options_t *, SuperMatrix *, - ScalePermstruct_t *, double *, - int, int, gridinfo_t *, LUstruct_t *, double *, - SuperLUStat_t *, int *); -extern int_t pddistribute(fact_t, int_t, SuperMatrix *, - ScalePermstruct_t *, Glu_freeable_t *, - LUstruct_t *, gridinfo_t *); -extern void pdgssvx(superlu_options_t *, SuperMatrix *, - ScalePermstruct_t *, double *, - int, int, gridinfo_t *, LUstruct_t *, - SOLVEstruct_t *, double *, SuperLUStat_t *, int *); -extern int dSolveInit(superlu_options_t *, SuperMatrix *, int_t [], int_t [], - int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *); -extern int_t pxgstrs_init(int_t, int_t, int_t, int_t, - int_t [], int_t [], gridinfo_t *grid, - Glu_persist_t *, SOLVEstruct_t *); -extern void pxgstrs_finalize(pxgstrs_comm_t *); -extern void dSolveFinalize(superlu_options_t *, SOLVEstruct_t *); -extern void dldperm(int_t, int_t, int_t, int_t [], int_t [], - double [], int_t *, double [], double []); -extern int_t pdgstrf(superlu_options_t *, int, int, double, - LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); -extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *, - double *, int_t, int, SuperLUStat_t *, int *); -extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, - double *, int_t, int_t, int_t, int, SOLVEstruct_t *, - SuperLUStat_t *, int *); -extern void dlsum_fmod(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, int_t, int_t, - int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); -extern void dlsum_bmod(double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, - int_t **, int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); -extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *, - ScalePermstruct_t *, gridinfo_t *, - double [], int_t, double [], int_t, int, - SOLVEstruct_t *, double *, SuperLUStat_t *, int *); -extern void pdgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *, - gridinfo_t *, double *, int_t, double *, int_t, - int, double *, SuperLUStat_t *, int *); -extern int pdgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *, - gridinfo_t *, int_t *, int_t *[], - double *[], int_t *[], int_t []); -extern int pdgsmv_AXglobal(int_t, int_t [], double [], int_t [], - double [], double []); -extern int pdgsmv_AXglobal_abs(int_t, int_t [], double [], int_t [], - double [], double []); -extern void pdgsmv_init(SuperMatrix *, int_t *, gridinfo_t *, - pdgsmv_comm_t *); -extern void pdgsmv(int_t, SuperMatrix *, gridinfo_t *, pdgsmv_comm_t *, - double x[], double ax[]); -extern void pdgsmv_finalize(pdgsmv_comm_t *); - -/* Memory-related */ -extern double *doubleMalloc_dist(int_t); -extern double *doubleCalloc_dist(int_t); -extern void *duser_malloc_dist (int_t, int_t); -extern void duser_free_dist (int_t, int_t); -extern int_t dQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *, mem_usage_t *); -extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *); -extern void LUstructInit(const int_t, const int_t, LUstruct_t *); -extern void LUstructFree(LUstruct_t *); - -/* Auxiliary routines */ -extern void dfill_dist (double *, int_t, double); -extern void dinf_norm_error_dist (int_t, int_t, double*, int_t, - double*, int_t, gridinfo_t*); -extern void pdinf_norm_error(int, int_t, int_t, double [], int_t, - double [], int_t , gridinfo_t *); -extern void dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, - double **, int_t **, int_t **); - -/* Distribute the data for numerical factorization */ -extern int_t ddist_psymbtonum -(fact_t, int_t, SuperMatrix *, - ScalePermstruct_t *, Pslu_freeable_t *, - LUstruct_t *, gridinfo_t *); - -/* Routines for debugging */ -extern void dPrintLblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *, - LocalLU_t *); -extern void dPrintUblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *, - LocalLU_t *); -extern void dPrint_CompCol_Matrix_dist(SuperMatrix *); -extern void dPrint_Dense_Matrix_dist(SuperMatrix *); -extern int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *); -extern int file_PrintDouble5(FILE *, char *, int_t, double *); - -/* BLAS */ - -#ifdef USE_VENDOR_BLAS -extern int dgemm_(char*, char*, int*, int*, int*, double*, - double*, int*, double*, int*, double*, - double*, int*, int, int); -extern int dtrsv_(char*, char*, char*, int*, double*, int*, - double*, int*, int, int, int); -#else -extern int dgemm_(char*, char*, int*, int*, int*, double*, - double*, int*, double*, int*, double*, - double*, int*); -extern int dtrsv_(char*, char*, char*, int*, double*, int*, - double*, int*); -#endif - -extern int dger_(int*, int*, double*, double*, int*, - double*, int*, double*, int*); - - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_dDEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_defs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_defs.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_defs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_defs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,572 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 2.2) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * Feburary 20, 2008 - */ - -#ifndef __SUPERLU_DEFS /* allow multiple inclusions */ -#define __SUPERLU_DEFS - -/* - * File name: superlu_defs.h - * Purpose: Definitions which are precision-neutral - */ - -#include -#include -#ifdef _CRAY -#include -#include -#endif -#include -#include -#include - - -/* Define my integer size int_t */ -#ifdef _CRAY -typedef short int_t; -/*#undef int Revert back to int of default size. */ -#define mpi_int_t MPI_SHORT -#elif defined (_LONGINT) -typedef long int int_t; -#define mpi_int_t MPI_LONG -#else /* Default */ -typedef int int_t; -#define mpi_int_t MPI_INT -#endif - -/*********************************************************************** - * Enumerated types - ***********************************************************************/ -/*typedef enum {FALSE, TRUE} boolean_t;*/ -typedef enum {NO, YES} yes_no_t; -typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; -typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; -typedef enum {NATURAL, MMD_AT_PLUS_A, MMD_ATA, METIS_AT_PLUS_A, - PARMETIS, MY_PERMC} colperm_t; -typedef enum {NOTRANS, TRANS, CONJ} trans_t; -typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; -typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA} IterRefine_t; -typedef enum {LUSUP, UCOL, LSUB, USUB} MemType; -typedef enum {HEAD, TAIL} stack_end_t; -typedef enum {SYSTEM, USER} LU_space_t; - -#include "Cnames.h" -#include "supermatrix.h" -#include "util_dist.h" -#include "psymbfact.h" - - -/*********************************************************************** - * Constants - ***********************************************************************/ -/* - * For each block column of L, the index[] array contains both the row - * subscripts and the integers describing the size of the blocks. - * The organization of index[] looks like: - * - * [ BLOCK COLUMN HEADER (size BC_HEADER) - * number of blocks - * number of row subscripts, i.e., LDA of nzval[] - * BLOCK 0 <---- - * BLOCK DESCRIPTOR (of size LB_DESCRIPTOR) | - * block number (global) | - * number of full rows in the block | - * actual row subscripts | - * BLOCK 1 | Repeat ... - * BLOCK DESCRIPTOR | number of blocks - * block number (global) | - * number of full rows in the block | - * actual row subscripts | - * . | - * . | - * . <---- - * ] - * - * For each block row of U, the organization of index[] looks like: - * - * [ BLOCK ROW HEADER (of size BR_HEADER) - * number of blocks - * number of entries in nzval[] - * number of entries in index[] - * BLOCK 0 <---- - * BLOCK DESCRIPTOR (of size UB_DESCRIPTOR) | - * block number (global) | - * number of nonzeros in the block | - * actual fstnz subscripts | - * BLOCK 1 | Repeat ... - * BLOCK DESCRIPTOR | number of blocks - * block number (global) | - * number of nonzeros in the block | - * actual fstnz subscripts | - * . | - * . | - * . <---- - * ] - * - */ -#define BC_HEADER 2 -#define LB_DESCRIPTOR 2 -#define BR_HEADER 3 -#define UB_DESCRIPTOR 2 -#define NBUFFERS 5 - -/* - * Communication tags - */ - /* For numeric factorization. */ -#define NTAGS 10000 -#define UjROW 10 -#define UkSUB 11 -#define UkVAL 12 -#define LkSUB 13 -#define LkVAL 14 -#define LkkDIAG 15 - /* For triangular solves. */ -#define XK_H 1 /* The header preceeding each X block. */ -#define LSUM_H 1 /* The header preceeding each MOD block. */ -#define GSUM 20 -#define Xk 21 -#define Yk 22 -#define LSUM 23 - -/* - * Communication scopes - */ -#define COMM_ALL 100 -#define COMM_COLUMN 101 -#define COMM_ROW 102 - -/* - * Matrix distribution for sparse matrix-vector multiplication - */ -#define SUPER_LINEAR 11 -#define SUPER_BLOCK 12 - -/* - * No of marker arrays used in the symbolic factorization, each of size n - */ -#define NO_MARKER 3 - - - -/*********************************************************************** - * Macros - ***********************************************************************/ -#define IAM(comm) { int rank; MPI_Comm_rank ( comm, &rank ); rank}; -#define MYROW(iam,grid) ( (iam) / grid->npcol ) -#define MYCOL(iam,grid) ( (iam) % grid->npcol ) -#define BlockNum(i) ( supno[i] ) -#define FstBlockC(bnum) ( xsup[bnum] ) -#define SuperSize(bnum) ( xsup[bnum+1]-xsup[bnum] ) -#define LBi(bnum,grid) ( (bnum)/grid->nprow )/* Global to local block rowwise */ -#define LBj(bnum,grid) ( (bnum)/grid->npcol )/* Global to local block columnwise*/ -#define PROW(bnum,grid) ( (bnum) % grid->nprow ) -#define PCOL(bnum,grid) ( (bnum) % grid->npcol ) -#define PNUM(i,j,grid) ( (i)*grid->npcol + j ) /* Process number at coord(i,j) */ -#define CEILING(a,b) ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) - /* For triangular solves */ -#define RHS_ITERATE(i) \ - for (i = 0; i < nrhs; ++i) -#define X_BLK(i) \ - ilsum[i] * nrhs + (i+1) * XK_H -#define LSUM_BLK(i) \ - ilsum[i] * nrhs + (i+1) * LSUM_H - -#define SuperLU_timer_ SuperLU_timer_dist_ -#define LOG2(x) (log10((double) x) / log10(2.0)) - - -#if ( VAMPIR>=1 ) -#define VT_TRACEON VT_traceon() -#define VT_TRACEOFF VT_traceoff() -#else -#define VT_TRACEON -#define VT_TRACEOFF -#endif - - -/*********************************************************************** - * New data types - ***********************************************************************/ - -/* - * Define the 2D mapping of matrix blocks to process grid. - * - * Process grid: - * Processes are numbered (0 : P-1). - * P = Pr x Pc, where Pr, Pc are the number of process rows and columns. - * (pr,pc) is the coordinate of IAM; 0 <= pr < Pr, 0 <= pc < Pc. - * - * Matrix blocks: - * Matrix is partitioned according to supernode partitions, both - * column and row-wise. - * The k-th block columns (rows) contains columns (rows) (s:t), where - * s=xsup[k], t=xsup[k+1]-1. - * Block A(I,J) contains - * rows from (xsup[I]:xsup[I+1]-1) and - * columns from (xsup[J]:xsup[J+1]-1) - * - * Mapping of matrix entry (i,j) to matrix block (I,J): - * (I,J) = ( supno[i], supno[j] ) - * - * Mapping of matrix block (I,J) to process grid (pr,pc): - * (pr,pc) = ( MOD(I,NPROW), MOD(J,NPCOL) ) - * - * (xsup[nsupers],supno[n]) are replicated on all processors. - * - */ - -/*-- Communication subgroup */ -typedef struct { - MPI_Comm comm; /* MPI communicator */ - int Np; /* number of processes */ - int Iam; /* my process number */ -} superlu_scope_t; - -/*-- Process grid definition */ -typedef struct { - MPI_Comm comm; /* MPI communicator */ - superlu_scope_t rscp; /* row scope */ - superlu_scope_t cscp; /* column scope */ - int iam; /* my process number in this scope */ - int_t nprow; /* number of process rows */ - int_t npcol; /* number of process columns */ -} gridinfo_t; - - -/* - *-- The structures are determined by SYMBFACT and used thereafter. - * - * (xsup,supno) describes mapping between supernode and column: - * xsup[s] is the leading column of the s-th supernode. - * supno[i] is the supernode no to which column i belongs; - * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) - * xsup 0 1 2 4 7 12 - * Note: dfs will be performed on supernode rep. relative to the new - * row pivoting ordering - * - * This is allocated during symbolic factorization SYMBFACT. - */ -typedef struct { - int_t *xsup; - int_t *supno; -} Glu_persist_t; - -/* - *-- The structures are determined by SYMBFACT and used by DDISTRIBUTE. - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: original row subscripts - * - * During the course of sparse LU factorization, we also use - * (xlsub,lsub) for the purpose of symmetric pruning. For each - * supernode {s,s+1,...,t=s+r} with first column s and last - * column t, the subscript set - * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * Furthermore, - * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * However, if the supernode has only one column, then we - * only keep one set of subscripts. For any subscript interchange - * performed, similar interchange must be done on the numerical - * values. - * - * The last column structures (for pruning) will be removed - * after the numercial LU factorization phase. - * - * (xusub,usub): xusub[i] points to the starting location of column i - * in usub[]. For each U-segment, only the row index of first nonzero - * is stored in usub[]. - * - * Each U column consists of a number of full segments. Each full segment - * starts from a leading nonzero, running up to the supernode (block) - * boundary. (Recall that the column-wise supernode partition is also - * imposed on the rows.) Because the segment is full, we don't store all - * the row indices. Instead, only the leading nonzero index is stored. - * The rest can be found together with xsup/supno pair. - * For example, - * usub[xsub[j+1]] - usub[xsub[j]] = number of segments in column j. - * for any i in usub[], - * supno[i] = block number in which i belongs to - * xsup[supno[i]+1] = first row of the next block - * The nonzeros of this segment are: - * i, i+1 ... xsup[supno[i]+1]-1 (only i is stored in usub[]) - * - */ -typedef struct { - int_t *lsub; /* compressed L subscripts */ - int_t *xlsub; - int_t *usub; /* compressed U subscripts */ - int_t *xusub; - int_t nzlmax; /* current max size of lsub */ - int_t nzumax; /* " " " usub */ - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ -} Glu_freeable_t; - - -/* - *-- The structure used to store matrix A of the linear system and - * several vectors describing the transformations done to matrix A. - * - * A (SuperMatrix*) - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). - * The number of linear equations is A->nrow. The type of A can be: - * Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. - * - * DiagScale (DiagScale_t) - * Specifies the form of equilibration that was done. - * = NOEQUIL: No equilibration. - * = ROW: Row equilibration, i.e., A was premultiplied by diag(R). - * = COL: Column equilibration, i.e., A was postmultiplied by diag(C). - * = BOTH: Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * - * R double*, dimension (A->nrow) - * The row scale factors for A. - * If DiagScale = ROW or BOTH, A is multiplied on the left by diag(R). - * If DiagScale = NOEQUIL or COL, R is not defined. - * - * C double*, dimension (A->ncol) - * The column scale factors for A. - * If DiagScale = COL or BOTH, A is multiplied on the right by diag(C). - * If DiagScale = NOEQUIL or ROW, C is not defined. - * - * perm_r (int*) dimension (A->nrow) - * Row permutation vector which defines the permutation matrix Pr, - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * perm_c (int*) dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - */ -typedef struct { - DiagScale_t DiagScale; - double *R; - double *C; - int_t *perm_r; - int_t *perm_c; -} ScalePermstruct_t; - -/* - *-- This contains the options used to control the solve process. - * - * Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorizaed. - * = DOFACT: The matrix A will be factorized from scratch, and the - * factors will be stored in L and U. - * = SamePattern: The matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the column elimination tree - * LUstruct->etree. - * = SamePattern_SameRowPerm: The matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, and the - * both row and column permutation vectors perm_r and perm_c, - * distributed data structure set up from the previous symbolic - * factorization. - * = FACTORED: On entry, L, U, perm_r and perm_c contain the - * factored form of A. If DiagScale is not NOEQUIL, the matrix - * A has been equilibrated with scaling factors R and C. - * - * Equil (yes_no_t) - * Specifies whether to equilibrate the system (scale A's row and - * columns to have unit norm). - * - * ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: use the natural ordering - * = MMD_ATA: use minimum degree ordering on structure of A'*A - * = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A - * = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[] - * - * Trans (trans_t) - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A**T * X = B (Transpose) - * = CONJ: A**H * X = B (Transpose) - * - * IterRefine (IterRefine_t) - * Specifies whether to perform iterative refinement. - * = NO: no iterative refinement - * = WorkingPrec: perform iterative refinement in working precision - * = ExtraPrec: perform iterative refinement in extra precision - * - * PrintStat (yes_no_t) - * Specifies whether to print the solver's statistics. - * - * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU) - * Specifies the threshold used for a diagonal entry to be an - * acceptable pivot. - * - * RowPerm (rowperm_t) (only for SuperLU_DIST) - * Specifies whether to permute rows of the original matrix. - * = NO: not to permute the rows - * = LargeDiag: make the diagonal large relative to the off-diagonal - * = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[] - * - * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST) - * Specifies whether to replace the tiny diagonals by - * sqrt(epsilon)*||A|| during LU factorization. - * - * SolveInitialized (yes_no_t) (only for SuperLU_DIST) - * Specifies whether the initialization has been performed to the - * triangular solve. - * - * RefineInitialized (yes_no_t) (only for SuperLU_DIST) - * Specifies whether the initialization has been performed to the - * sparse matrix-vector multiplication routine needed in iterative - * refinement. - */ -typedef struct { - fact_t Fact; - yes_no_t Equil; - yes_no_t ParSymbFact; - colperm_t ColPerm; - rowperm_t RowPerm; - double DiagPivotThresh; - IterRefine_t IterRefine; - trans_t Trans; - yes_no_t ReplaceTinyPivot; - yes_no_t SolveInitialized; - yes_no_t RefineInitialized; - yes_no_t PrintStat; -} superlu_options_t; - -typedef struct { - float for_lu; - float total; - int_t expansions; -} mem_usage_t; - - -/*********************************************************************** - * Function prototypes - ***********************************************************************/ - -#ifdef __cplusplus -extern "C" { -#endif - -extern void set_default_options_dist(superlu_options_t *); -extern void print_options_dist(superlu_options_t *); -extern void Destroy_CompCol_Matrix_dist(SuperMatrix *); -extern void Destroy_SuperNode_Matrix_dist(SuperMatrix *); -extern void Destroy_SuperMatrix_Store_dist(SuperMatrix *); -extern void Destroy_CompCol_Permuted_dist(SuperMatrix *); -extern void Destroy_CompRowLoc_Matrix_dist(SuperMatrix *); -extern void Destroy_CompRow_Matrix_dist(SuperMatrix *); -extern void sp_colorder (superlu_options_t*, SuperMatrix*, int_t*, int_t*, - SuperMatrix*); -extern int_t sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, - int_t *); -extern void countnz_dist (const int_t, int_t *, int_t *, int_t *, - Glu_persist_t *, Glu_freeable_t *); -extern int_t fixupL_dist (const int_t, const int_t *, Glu_persist_t *, - Glu_freeable_t *); -extern int_t *TreePostorder_dist (int_t, int_t *); -extern float slamch_(char *); -extern double dlamch_(char *); -extern void *superlu_malloc_dist (size_t); -extern void superlu_free_dist (void*); -extern int_t *intMalloc_dist (int_t); -extern int_t *intCalloc_dist (int_t); - -/* Auxiliary routines */ -extern double SuperLU_timer_ (); -extern void superlu_abort_and_exit_dist(char *); -extern int_t sp_ienv_dist (int_t); -extern int lsame_ (char *, char *); -extern int xerbla_ (char *, int *); -extern void ifill_dist (int_t *, int_t, int_t); -extern void super_stats_dist (int_t, int_t *); -extern void ScalePermstructInit(const int_t, const int_t, - ScalePermstruct_t *); -extern void ScalePermstructFree(ScalePermstruct_t *); -extern void superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *); -extern void superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t, - gridinfo_t *); -extern void superlu_gridexit(gridinfo_t *); -extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *); -extern void a_plus_at_dist(const int_t, const int_t, int_t *, int_t *, - int_t *, int_t **, int_t **); -extern void bcast_tree(void *, int, MPI_Datatype, int, int, - gridinfo_t *, int, int *); -extern int_t symbfact(superlu_options_t *, int, SuperMatrix *, int_t *, - int_t *, Glu_persist_t *, Glu_freeable_t *); -extern int_t symbfact_SubInit(fact_t, void *, int_t, int_t, int_t, int_t, - Glu_persist_t *, Glu_freeable_t *); -extern int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *, - Glu_freeable_t *); -extern int_t symbfact_SubFree(Glu_freeable_t *); -extern void get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *, - int_t **, int_t **); -extern int_t QuerySpace_dist(int_t, int_t, Glu_freeable_t *, mem_usage_t *); -extern int xerbla_ (char *, int *); -extern void pxerbla (char *, gridinfo_t *, int_t); -extern void PStatInit(SuperLUStat_t *); -extern void PStatFree(SuperLUStat_t *); -extern void PStatPrint(superlu_options_t *, SuperLUStat_t *, gridinfo_t *); - - -/* Prototypes for parallel symbolic factorization */ -extern float symbfact_dist -(int, int, SuperMatrix *, int_t *, int_t *, int_t *, int_t *, - Pslu_freeable_t *, MPI_Comm *, MPI_Comm *, mem_usage_t *); - -/* Get the column permutation using parmetis */ -extern float get_perm_c_parmetis -(SuperMatrix *, int_t *, int_t *, int, int, - int_t **, int_t **, gridinfo_t *, MPI_Comm *); - -/* Auxiliary routines for memory expansions used during - the parallel symbolic factorization routine */ - -extern int_t psymbfact_LUXpandMem -(int_t, int_t, int_t, int_t, int_t, int_t, int_t, int_t, - Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -extern int_t psymbfact_LUXpand -(int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, - Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -extern int_t psymbfact_LUXpand_RL -(int_t, int_t, int_t, int_t, int_t, int_t, - Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *); - -extern int_t psymbfact_prLUXpand -(int_t, int_t, - MemType, Llu_symbfact_t *, psymbfact_stat_t *); - -/* Routines for debugging */ -extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); -extern void check_repfnz_dist(int_t, int_t, int_t, int_t *); -extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *); -extern void PrintDouble5(char *, int_t, double *); -extern void PrintInt10(char *, int_t, int_t *); -extern int file_PrintInt10(FILE *, char *, int_t, int_t *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_DEFS */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_grid.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_grid.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_grid.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_grid.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,169 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_ddefs.h" - -/* Define global variables */ -MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX; - -/* - * All processes in the MPI communicator must call this routine. - */ -void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which - the new grid is formed. */ - int_t nprow, int_t npcol, gridinfo_t *grid) -{ - int Np = nprow * npcol; - int_t *usermap; - int i, j, info; - - /* Make a list of the processes in the new communicator. */ - usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); - for (j = 0; j < npcol; ++j) - for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; - - /* Check MPI environment initialization. */ - MPI_Initialized( &info ); - if ( !info ) - ABORT("C main program must explicitly call MPI_Init()"); - - MPI_Comm_size( Bcomm, &info ); - if ( info < Np ) - ABORT("Number of processes is smaller than NPROW * NPCOL"); - - superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid); - - SUPERLU_FREE(usermap); -} - - -/* - * All processes in the MPI communicator must call this routine. - */ -void superlu_gridmap( - MPI_Comm Bcomm, /* The base communicator upon which - the new grid is formed. */ - int_t nprow, - int_t npcol, - int_t usermap[], /* usermap(i,j) holds the process - number to be placed in {i,j} of - the process grid. */ - int_t ldumap, /* The leading dimension of the - 2D array usermap[]. */ - gridinfo_t *grid) -{ - MPI_Group mpi_base_group, superlu_grp; - int Np = nprow * npcol, mycol, myrow; - int *pranks; - int i, j, info; - static int first_init = 1; - - /* Create datatype in C for MPI complex. */ - if ( first_init ) { - MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX ); - MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX ); - first_init = 0; - } - - /* Check MPI environment initialization. */ - MPI_Initialized( &info ); - if ( !info ) - ABORT("C main program must explicitly call MPI_Init()"); - - grid->nprow = nprow; - grid->npcol = npcol; - - /* Make a list of the processes in the new communicator. */ - pranks = (int *) SUPERLU_MALLOC(Np*sizeof(int)); - for (j = 0; j < npcol; ++j) - for (i = 0; i < nprow; ++i) - pranks[i*npcol+j] = usermap[j*ldumap+i]; - - /* - * Form MPI communicator for all. - */ - /* Get the group underlying Bcomm. */ - MPI_Comm_group( Bcomm, &mpi_base_group ); - /* Create the new group. */ - MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); - /* Create the new communicator. */ - /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. */ - MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); - - /* Bail out if I am not in the group, superlu_group. */ - if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; - } - - MPI_Comm_rank( grid->comm, &(grid->iam) ); - myrow = grid->iam / npcol; - mycol = grid->iam % npcol; - - /* - * Form MPI communicator for myrow, scope = COMM_ROW. - */ -#if 0 - for (i = 0; i < npcol; ++i) pranks[i] = myrow*npcol + i; - MPI_Comm_group( grid->comm, &superlu_grp ); /* Find all's group */ - MPI_Group_incl( superlu_grp, npcol, pranks, &grp ); /* Form new group */ - MPI_Comm_create( grid->comm, grp, &grid->rscp.comm );/* Create new comm */ -#else - MPI_Comm_split(grid->comm, myrow, mycol, &(grid->rscp.comm)); -#endif - - /* - * Form MPI communicator for mycol, scope = COMM_COLUMN. - */ -#if 0 - for (i = 0; i < nprow; ++i) pranks[i] = i*npcol + mycol; - MPI_Group_incl( superlu_grp, nprow, pranks, &grp ); /* Form new group */ - MPI_Comm_create( grid->comm, grp, &grid->cscp.comm );/* Create new comm */ -#else - MPI_Comm_split(grid->comm, mycol, myrow, &(grid->cscp.comm)); -#endif - - grid->rscp.Np = npcol; - grid->rscp.Iam = mycol; - grid->cscp.Np = nprow; - grid->cscp.Iam = myrow; - -#if 0 - { - int tag_ub; - if ( !grid->iam ) { - MPI_Attr_get(Bcomm, MPI_TAG_UB, &tag_ub, &info); - printf("MPI_TAG_UB %d\n", tag_ub); - /* returns 4295677672 - In reality it is restricted to no greater than 16384. */ - } - exit(0); - } -#endif - - SUPERLU_FREE(pranks); - MPI_Group_free(&superlu_grp); - MPI_Group_free(&mpi_base_group); -} - -void superlu_gridexit(gridinfo_t *grid) -{ - if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { - /* Marks the communicator objects for deallocation. */ - MPI_Comm_free( &grid->rscp.comm ); - MPI_Comm_free( &grid->cscp.comm ); - MPI_Comm_free( &grid->comm ); - } - if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) { - MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX ); - } -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_timer.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_timer.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_timer.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_timer.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -/* - * Purpose - * ======= - * Returns the time in seconds used by the process. - * - * Note: the timer function call is machine dependent. Use conditional - * compilation to choose the appropriate function. - * - */ - -#include "superlu_defs.h" - -#ifdef SUN -/* - * It uses the system call gethrtime(3C), which is accurate to - * nanoseconds. -*/ -#include - -double SuperLU_timer_() { - return ( (double)gethrtime() / 1e9 ); -} - -#elif defined ( UNIX_TIMER ) - -#include -#include -#include -#include - -#ifndef CLK_TCK -#define CLK_TCK 60 -#endif - -double SuperLU_timer_() -{ - struct tms use; - double tmp; - times(&use); - tmp = use.tms_utime; - tmp += use.tms_stime; - return (double)(tmp) / (double) CLK_TCK; -} - -#elif _WIN32 - -#include - -double SuperLU_timer_() -{ - clock_t t; - t=clock(); - - return ((double)t)/CLOCKS_PER_SEC; -} - -#else - -#include - -double SuperLU_timer_() -{ - return MPI_Wtime(); -} - -#endif - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,312 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.1) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * November 1, 2007 - * - */ - -#ifndef __SUPERLU_zDEFS /* allow multiple inclusions */ -#define __SUPERLU_zDEFS - -/* - * File name: superlu_zdefs.h - * Purpose: Distributed SuperLU data types and function prototypes - * History: - */ - -#include "superlu_defs.h" -#include "dcomplex.h" - -/* - * On each processor, the blocks in L are stored in compressed block - * column format, the blocks in U are stored in compressed block row format. - */ -typedef struct { - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ -#if 0 - int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ - double *Lval_buf; /* Buffer for the remote nonzeros of L */ -#endif - int_t *Lsub_buf_2[2]; /* Buffers for the remote subscripts of L*/ - doublecomplex *Lval_buf_2[2]; /* Buffers for the remote nonzeros of L */ - int_t *Usub_buf; /* Buffer for the remote subscripts of U */ - doublecomplex *Uval_buf; /* Buffer for the remote nonzeros of U */ - doublecomplex *ujrow; /* used in panel factorization. */ - int_t bufmax[NBUFFERS]; /* Buffer size; 5 entries - * 0 : size of Lsub_buf[] - * 1 : size of Lval_buf[] - * 2 : size of Usub_buf[] - * 3 : size of Uval_buf[] - * 4 : size of tempv[LDA] - */ - - /*-- Record communication schedule for factorization. --*/ - int_t *ToRecv; /* Recv from no one (0), left (1), and up (2).*/ - int_t *ToSendD; /* Whether need to send down block row. */ - int_t **ToSendR; /* List of processes to send right block col. */ - - /*-- Record communication schedule for solves. --*/ - int_t *fmod; /* Modification count for L-solve */ - int_t **fsendx_plist; /* Column process list to send down Xk */ - int_t *frecv; /* Modifications to be recv'd in proc row */ - int_t nfrecvx; /* Number of Xk I will receive in L-solve */ - int_t nfsendx; /* Number of Xk I will send in L-solve */ - int_t *bmod; /* Modification count for U-solve */ - int_t **bsendx_plist; /* Column process list to send down Xk */ - int_t *brecv; /* Modifications to be recv'd in proc row */ - int_t nbrecvx; /* Number of Xk I will receive in U-solve */ - int_t nbsendx; /* Number of Xk I will send in U-solve */ - - /*-- Auxiliary arrays used for solves. --*/ - int_t *ilsum; /* Starting position of each supernode in lsum - (local) */ - int_t ldalsum; /* LDA of lsum (local) */ - int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */ - int_t SolveMsgVol; /* Volume of messages sent in the solve phase */ -} LocalLU_t; - -typedef struct { - int_t *etree; - Glu_persist_t *Glu_persist; - LocalLU_t *Llu; -} LUstruct_t; - -/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */ -typedef struct { - int_t lbnum; /* Row block number (local). */ - int_t indpos; /* Starting position in Uindex[]. */ -} Ucb_indptr_t; - -/*-- Data structure for communication during matrix-vector multiplication. */ -typedef struct { - int_t *extern_start; - int_t *ind_tosend; /* X indeices to be sent to other processes */ - int_t *ind_torecv; /* X indeices to be received from other processes */ - int_t *ptr_ind_tosend;/* Printers to ind_tosend[] (Size procs) - (also point to val_torecv) */ - int_t *ptr_ind_torecv;/* Printers to ind_torecv[] (Size procs) - (also point to val_tosend) */ - int *SendCounts; /* Numbers of X indices to be sent - (also numbers of X values to be received) */ - int *RecvCounts; /* Numbers of X indices to be received - (also numbers of X values to be sent) */ - doublecomplex *val_tosend; /* X values to be sent to other processes */ - doublecomplex *val_torecv; /* X values to be received from other processes */ - int_t TotalIndSend; /* Total number of indices to be sent - (also total number of values to be received) */ - int_t TotalValSend; /* Total number of values to be sent. - (also total number of indices to be received) */ -} pzgsmv_comm_t; - -/*-- Data structure for redistribution of B and X --*/ -typedef struct { - int *B_to_X_SendCnt; - int *X_to_B_SendCnt; - int *ptr_to_ibuf, *ptr_to_dbuf; -} pxgstrs_comm_t; - -/*-- Data structure holding the information for the solution phase --*/ -typedef struct { - int_t *row_to_proc; - int_t *inv_perm_c; - int_t num_diag_procs, *diag_procs, *diag_len; - pzgsmv_comm_t *gsmv_comm; - pxgstrs_comm_t *gstrs_comm; - int_t *A_colind_gsmv; /* After pzgsmv_init(), the global column - indices of A are translated into the relative - positions in the gathered x-vector. - This is re-used in repeated calls to pzgsmv() */ -} SOLVEstruct_t; - - -/*********************************************************************** - * Function prototypes - ***********************************************************************/ - -#ifdef __cplusplus -extern "C" { -#endif - - -/* Supernodal LU factor related */ -extern void -zCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, - int_t *, int_t *, Stype_t, Dtype_t, Mtype_t); -extern void -zCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t, - int_t, doublecomplex *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); -extern void -zCompRow_to_CompCol_dist(int_t, int_t, int_t, doublecomplex *, int_t *, int_t *, - doublecomplex **, int_t **, int_t **); -extern int -pzCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *, - SuperMatrix *); -extern void -zCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *); -extern void -zCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, doublecomplex *, int_t, - Stype_t, Dtype_t, Mtype_t); -extern void -zCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, - int_t *, int_t *, int_t *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); -extern void -zCopy_Dense_Matrix_dist(int_t, int_t, doublecomplex *, int_t, - doublecomplex *, int_t); - -extern void zallocateA_dist (int_t, int_t, doublecomplex **, int_t **, int_t **); -extern void zGenXtrue_dist (int_t, int_t, doublecomplex *, int_t); -extern void zFillRHS_dist (char *, int_t, doublecomplex *, int_t, - SuperMatrix *, doublecomplex *, int_t); -extern int zcreate_matrix(SuperMatrix *, int, doublecomplex **, int *, - doublecomplex **, int *, FILE *, gridinfo_t *); - -/* Driver related */ -extern void zgsequ_dist (SuperMatrix *, double *, double *, double *, - double *, double *, int_t *); -extern double zlangs_dist (char *, SuperMatrix *); -extern void zlaqgs_dist (SuperMatrix *, double *, double *, double, - double, double, char *); -extern void pzgsequ (SuperMatrix *, double *, double *, double *, - double *, double *, int_t *, gridinfo_t *); -extern double pzlangs (char *, SuperMatrix *, gridinfo_t *); -extern void pzlaqgs (SuperMatrix *, double *, double *, double, - double, double, char *); -extern int pzPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[], - doublecomplex [], int, doublecomplex [], int, int, - gridinfo_t *); - -extern int sp_ztrsv_dist (char *, char *, char *, SuperMatrix *, - SuperMatrix *, doublecomplex *, int *); -extern int sp_zgemv_dist (char *, doublecomplex, SuperMatrix *, doublecomplex *, - int, doublecomplex, doublecomplex *, int); -extern int sp_zgemm_dist (char *, char *, int, int, int, doublecomplex, - SuperMatrix *, doublecomplex *, int, doublecomplex, - doublecomplex *, int); - -extern int_t zdistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, - LUstruct_t *, gridinfo_t *); -extern void pzgssvx_ABglobal(superlu_options_t *, SuperMatrix *, - ScalePermstruct_t *, doublecomplex *, - int, int, gridinfo_t *, LUstruct_t *, double *, - SuperLUStat_t *, int *); -extern int_t pzdistribute(fact_t, int_t, SuperMatrix *, - ScalePermstruct_t *, Glu_freeable_t *, - LUstruct_t *, gridinfo_t *); -extern void pzgssvx(superlu_options_t *, SuperMatrix *, - ScalePermstruct_t *, doublecomplex *, - int, int, gridinfo_t *, LUstruct_t *, - SOLVEstruct_t *, double *, SuperLUStat_t *, int *); -extern int zSolveInit(superlu_options_t *, SuperMatrix *, int_t [], int_t [], - int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *); -extern int_t pxgstrs_init(int_t, int_t, int_t, int_t, - int_t [], int_t [], gridinfo_t *grid, - Glu_persist_t *, SOLVEstruct_t *); -extern void pxgstrs_finalize(pxgstrs_comm_t *); -extern void zSolveFinalize(superlu_options_t *, SOLVEstruct_t *); -extern void zldperm(int_t, int_t, int_t, int_t [], int_t [], - doublecomplex [], int_t *, double [], double []); -extern int_t pzgstrf(superlu_options_t *, int, int, double, - LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); -extern void pzgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *, - doublecomplex *, int_t, int, SuperLUStat_t *, int *); -extern void pzgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, - doublecomplex *, int_t, int_t, int_t, int, SOLVEstruct_t *, - SuperLUStat_t *, int *); -extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int, int_t , int_t *, int_t, int_t, int_t, - int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); -extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, - int_t **, int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); -extern void pzgsrfs(int_t, SuperMatrix *, double, LUstruct_t *, - ScalePermstruct_t *, gridinfo_t *, - doublecomplex [], int_t, doublecomplex [], int_t, int, - SOLVEstruct_t *, double *, SuperLUStat_t *, int *); -extern void pzgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *, - gridinfo_t *, doublecomplex *, int_t, doublecomplex *, int_t, - int, double *, SuperLUStat_t *, int *); -extern int pzgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *, - gridinfo_t *, int_t *, int_t *[], - doublecomplex *[], int_t *[], int_t []); -extern int pzgsmv_AXglobal(int_t, int_t [], doublecomplex [], int_t [], - doublecomplex [], doublecomplex []); -extern int pzgsmv_AXglobal_abs(int_t, int_t [], doublecomplex [], int_t [], - doublecomplex [], double []); -extern void pzgsmv_init(SuperMatrix *, int_t *, gridinfo_t *, - pzgsmv_comm_t *); -extern void pzgsmv(int_t, SuperMatrix *, gridinfo_t *, pzgsmv_comm_t *, - doublecomplex x[], doublecomplex ax[]); -extern void pzgsmv_finalize(pzgsmv_comm_t *); - -/* Memory-related */ -extern doublecomplex *doublecomplexMalloc_dist(int_t); -extern doublecomplex *doublecomplexCalloc_dist(int_t); -extern double *doubleMalloc_dist(int_t); -extern double *doubleCalloc_dist(int_t); -extern void *duser_malloc_dist (int_t, int_t); -extern void duser_free_dist (int_t, int_t); -extern int_t zQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *, mem_usage_t *); -extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *); -extern void LUstructInit(const int_t, const int_t, LUstruct_t *); -extern void LUstructFree(LUstruct_t *); - -/* Auxiliary routines */ -extern void zfill_dist (doublecomplex *, int_t, doublecomplex); -extern void zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t, - doublecomplex*, int_t, gridinfo_t*); -extern void pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t, - doublecomplex [], int_t , gridinfo_t *); -extern void zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, - doublecomplex **, int_t **, int_t **); - -/* Distribute the data for numerical factorization */ -extern int_t zdist_psymbtonum -(fact_t, int_t, SuperMatrix *, - ScalePermstruct_t *, Pslu_freeable_t *, - LUstruct_t *, gridinfo_t *); - -/* Routines for debugging */ -extern void zPrintLblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *, - LocalLU_t *); -extern void zPrintUblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *, - LocalLU_t *); -extern void zPrint_CompCol_Matrix_dist(SuperMatrix *); -extern void zPrint_Dense_Matrix_dist(SuperMatrix *); -extern int zPrint_CompRowLoc_Matrix_dist(SuperMatrix *); -extern void PrintDoublecomplex(char *, int_t, doublecomplex *); -extern int file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *); - -/* BLAS */ - -#ifdef USE_VENDOR_BLAS -extern int zgemm_(char*, char*, int*, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*, doublecomplex*, - doublecomplex*, int*, int, int); -extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, - doublecomplex*, int*, int, int, int); -#else -extern int zgemm_(char*, char*, int*, int*, int*, doublecomplex*, - doublecomplex*, int*, doublecomplex*, int*, doublecomplex*, - doublecomplex*, int*); -extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, - doublecomplex*, int*); -#endif - -extern int zger_(int*, int*, doublecomplex*, doublecomplex*, int*, - doublecomplex*, int*, doublecomplex*, int*); - - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_dDEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,177 +0,0 @@ -#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ -#define __SUPERLU_SUPERMATRIX - - -/******************************************** - * The matrix types are defined as follows. * - ********************************************/ -typedef enum { - SLU_NC, /* column-wise, no supernode */ - SLU_NCP, /* column-wise, column-permuted, no supernode - (The consecutive columns of nonzeros, after permutation, - may not be stored contiguously.) */ - SLU_NR, /* row-wize, no supernode */ - SLU_SC, /* column-wise, supernode */ - SLU_SCP, /* supernode, column-wise, permuted */ - SLU_SR, /* row-wise, supernode */ - SLU_DN, /* Fortran style column-wise storage for dense matrix */ - SLU_NR_loc /* distributed compressed row format */ -} Stype_t; - -typedef enum { - SLU_S, /* single */ - SLU_D, /* double */ - SLU_C, /* single complex */ - SLU_Z /* double complex */ -} Dtype_t; - -typedef enum { - SLU_GE, /* general */ - SLU_TRLU, /* lower triangular, unit diagonal */ - SLU_TRUU, /* upper triangular, unit diagonal */ - SLU_TRL, /* lower triangular */ - SLU_TRU, /* upper triangular */ - SLU_SYL, /* symmetric, store lower half */ - SLU_SYU, /* symmetric, store upper half */ - SLU_HEL, /* Hermitian, store lower half */ - SLU_HEU /* Hermitian, store upper half */ -} Mtype_t; - -typedef struct { - Stype_t Stype; /* Storage type: interprets the storage structure - pointed to by *Store. */ - Dtype_t Dtype; /* Data type. */ - Mtype_t Mtype; /* Matrix type: describes the mathematical property of - the matrix. */ - int_t nrow; /* number of rows */ - int_t ncol; /* number of columns */ - void *Store; /* pointer to the actual storage of the matrix */ -} SuperMatrix; - -/*********************************************** - * The storage schemes are defined as follows. * - ***********************************************/ - -/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind; /* pointer to array of row indices of the nonzeros */ - int_t *colptr; /* pointer to array of beginning of columns in nzval[] - and rowind[] */ - /* Note: - Zero-based indexing is used; - colptr[] has ncol+1 entries, the last one pointing - beyond the last column, so that colptr[ncol] = nnz. */ -} NCformat; - -/* Stype == SLU_NR */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by raw */ - int_t *colind; /* pointer to array of columns indices of the nonzeros */ - int_t *rowptr; /* pointer to array of beginning of rows in nzval[] - and colind[] */ - /* Note: - Zero-based indexing is used; - rowptr[] has nrow+1 entries, the last one pointing - beyond the last row, so that rowptr[nrow] = nnz. */ -} NRformat; - -/* Stype == SLU_SC */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - int_t nsuper; /* number of supernodes, minus 1 */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ - int_t *rowind; /* pointer to array of compressed row indices of - rectangular supernodes */ - int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ - int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column - j belongs; mapping from column to supernode number. */ - int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th - supernode; mapping from supernode number to column. - e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) - sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ - /* Note: - Zero-based indexing is used; - nzval_colptr[], rowind_colptr[], col_to_sup and - sup_to_col[] have ncol+1 entries, the last one - pointing beyond the last column. - For col_to_sup[], only the first ncol entries are - defined. For sup_to_col[], only the first nsuper+2 - entries are defined. */ -} SCformat; - -/* Stype == SLU_SCP */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - int_t nsuper; /* number of supernodes */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j - in nzval[] */ - int_t *nzval_colend;/* nzval_colend[j] points to one past the last element - of column j in nzval[] */ - int_t *rowind; /* pointer to array of compressed row indices of - rectangular supernodes */ - int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j - in rowind[] */ - int_t *rowind_colend;/* rowind_colend[j] points to one past the last element - of column j in rowind[] */ - int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column - j belongs; mapping from column to supernode. */ - int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th - supernode; mapping from supernode to column.*/ - int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the - s-th supernode; mapping from supernode number to - column. - e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) - sup_to_colbeg: 0 1 2 4 7 (nsuper=4) - sup_to_colend: 1 2 4 7 12 */ - /* Note: - Zero-based indexing is used; - nzval_colptr[], rowind_colptr[], col_to_sup and - sup_to_col[] have ncol+1 entries, the last one - pointing beyond the last column. */ -} SCPformat; - -/* Stype == SLU_NCP */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind;/* pointer to array of row indices of the nonzeros */ - /* Note: nzval[]/rowind[] always have the same length */ - int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] - and rowind[] */ - int_t *colend;/* colend[j] points to one past the last element of column - j in nzval[] and rowind[] */ - /* Note: - Zero-based indexing is used; - The consecutive columns of the nonzeros may not be - contiguous in storage, because the matrix has been - postmultiplied by a column permutation matrix. */ -} NCPformat; - -/* Stype == SLU_DN */ -typedef struct { - int_t lda; /* leading dimension */ - void *nzval; /* array of size lda*ncol to represent a dense matrix */ -} DNformat; - -/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */ -typedef struct { - int_t nnz_loc; /* number of nonzeros in the local submatrix */ - int_t m_loc; /* number of rows local to this processor */ - int_t fst_row; /* global index of the first row */ - void *nzval; /* pointer to array of nonzero values, packed by row */ - int_t *rowptr; /* pointer to array of beginning of rows in nzval[] - and colind[] */ - int_t *colind; /* pointer to array of column indices of the nonzeros */ - /* Note: - Zero-based indexing is used; - rowptr[] has n_loc + 1 entries, the last one pointing - beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ -} NRformat_loc; - - -#endif /* __SUPERLU_SUPERMATRIX */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old 1970-01-01 00:00:00.000000000 +0000 @@ -1,144 +0,0 @@ -#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ -#define __SUPERLU_SUPERMATRIX - - -/******************************************** - * The matrix types are defined as follows. * - ********************************************/ -typedef enum { - SLU_NC, /* column-wise, no supernode */ - SLU_NR, /* row-wize, no supernode */ - SLU_SC, /* column-wise, supernode */ - SLU_SR, /* row-wise, supernode */ - SLU_NCP, /* column-wise, column-permuted, no supernode - (The consecutive columns of nonzeros, after permutation, - may not be stored contiguously.) */ - SLU_DN, /* Fortran style column-wise storage for dense matrix */ - SLU_NR_loc /* distributed compressed row format */ -} Stype_t; - -typedef enum { - SLU_S, /* single */ - SLU_D, /* double */ - SLU_C, /* single complex */ - SLU_Z /* double complex */ -} Dtype_t; - -typedef enum { - SLU_GE, /* general */ - SLU_TRLU, /* lower triangular, unit diagonal */ - SLU_TRUU, /* upper triangular, unit diagonal */ - SLU_TRL, /* lower triangular */ - SLU_TRU, /* upper triangular */ - SLU_SYL, /* symmetric, store lower half */ - SLU_SYU, /* symmetric, store upper half */ - SLU_HEL, /* Hermitian, store lower half */ - SLU_HEU /* Hermitian, store upper half */ -} Mtype_t; - -typedef struct { - Stype_t Stype; /* Storage type: interprets the storage structure - pointed to by *Store. */ - Dtype_t Dtype; /* Data type. */ - Mtype_t Mtype; /* Matrix type: describes the mathematical property of - the matrix. */ - int_t nrow; /* number of rows */ - int_t ncol; /* number of columns */ - void *Store; /* pointer to the actual storage of the matrix */ -} SuperMatrix; - -/*********************************************** - * The storage schemes are defined as follows. * - ***********************************************/ - -/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind; /* pointer to array of row indices of the nonzeros */ - int_t *colptr; /* pointer to array of beginning of columns in nzval[] - and rowind[] */ - /* Note: - Zero-based indexing is used; - colptr[] has ncol+1 entries, the last one pointing - beyond the last column, so that colptr[ncol] = nnz. */ -} NCformat; - -/* Stype == SLU_NR */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by raw */ - int_t *colind; /* pointer to array of columns indices of the nonzeros */ - int_t *rowptr; /* pointer to array of beginning of rows in nzval[] - and colind[] */ - /* Note: - Zero-based indexing is used; - rowptr[] has nrow+1 entries, the last one pointing - beyond the last row, so that rowptr[nrow] = nnz. */ -} NRformat; - -/* Stype == SLU_SC */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - int_t nsuper; /* number of supernodes, minus 1 */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ - int_t *rowind; /* pointer to array of compressed row indices of - rectangular supernodes */ - int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ - int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column - j belongs; mapping from column to supernode number. */ - int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th - supernode; mapping from supernode number to column. - e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) - sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ - /* Note: - Zero-based indexing is used; - nzval_colptr[], rowind_colptr[], col_to_sup and - sup_to_col[] have ncol+1 entries, the last one - pointing beyond the last column. - For col_to_sup[], only the first ncol entries are - defined. For sup_to_col[], only the first nsuper+2 - entries are defined. */ -} SCformat; - -/* Stype == SLU_NCP */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind;/* pointer to array of row indices of the nonzeros */ - /* Note: nzval[]/rowind[] always have the same length */ - int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] - and rowind[] */ - int_t *colend;/* colend[j] points to one past the last element of column - j in nzval[] and rowind[] */ - /* Note: - Zero-based indexing is used; - The consecutive columns of the nonzeros may not be - contiguous in storage, because the matrix has been - postmultiplied by a column permutation matrix. */ -} NCPformat; - -/* Stype == SLU_DN */ -typedef struct { - int_t lda; /* leading dimension */ - void *nzval; /* array of size lda*ncol to represent a dense matrix */ -} DNformat; - -/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */ -typedef struct { - int_t nnz_loc; /* number of nonzeros in the local submatrix */ - int_t m_loc; /* number of rows local to this processor */ - int_t fst_row; /* global index of the first row */ - void *nzval; /* pointer to array of nonzero values, packed by row */ - int_t *rowptr; /* pointer to array of beginning of rows in nzval[] - and colind[] */ - int_t *colind; /* pointer to array of column indices of the nonzeros */ - /* Note: - Zero-based indexing is used; - rowptr[] has n_loc + 1 entries, the last one pointing - beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ -} NRformat_loc; - - -#endif /* __SUPERLU_SUPERMATRIX */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/symbfact.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/symbfact.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/symbfact.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/symbfact.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,863 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - * Modified by X. S. Li. - */ - -#include "superlu_ddefs.h" - -/* What type of supernodes we want */ -#define T2_SUPER - - -/* - * Internal protypes - */ -static void relax_snode(int_t, int_t *, int_t, int_t *, int_t *); -static int_t snode_dfs(SuperMatrix *, const int_t, const int_t, int_t *, - int_t *, Glu_persist_t *, Glu_freeable_t *); -static int_t column_dfs(SuperMatrix *, const int_t, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, int_t *, int_t *, - Glu_persist_t *, Glu_freeable_t *); -static int_t pivotL(const int_t, int_t *, int_t *, - Glu_persist_t *, Glu_freeable_t *); -static int_t set_usub(const int_t, const int_t, const int_t, int_t *, int_t *, - Glu_persist_t *, Glu_freeable_t *); -static void pruneL(const int_t, const int_t *, const int_t, const int_t, - const int_t *, const int_t *, int_t *, - Glu_persist_t *, Glu_freeable_t *); - - -/************************************************************************/ -int_t symbfact -/************************************************************************/ -( - superlu_options_t *options, /* input options */ - int pnum, /* process number */ - SuperMatrix *A, /* original matrix A permuted by columns (input) */ - int_t *perm_c, /* column permutation vector (input) */ - int_t *etree, /* column elimination tree (input) */ - Glu_persist_t *Glu_persist, /* output */ - Glu_freeable_t *Glu_freeable /* output */ - ) -{ -/* - * Purpose - * ======= - * symbfact() performs a symbolic factorization on matrix A and sets up - * the nonzero data structures which are suitable for supernodal Gaussian - * elimination with no pivoting (GENP). This routine features: - * o depth-first search (DFS) - * o supernodes - * o symmetric structure pruning - * - * Return value - * ============ - * < 0, number of bytes needed for LSUB. - * = 0, matrix dimension is 1. - * > 0, number of bytes allocated when out of memory. - * - */ - int_t m, n, min_mn, j, i, k, irep, nseg, pivrow, info; - int_t *iwork, *perm_r, *segrep, *repfnz; - int_t *xprune, *marker, *parent, *xplore; - int_t relax, *desc, *relax_end; - int_t nnzL, nnzU; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(pnum, "Enter symbfact()"); -#endif - - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN(m, n); - - /* Allocate storage common to the symbolic factor routines */ - info = symbfact_SubInit(DOFACT, NULL, 0, m, n, ((NCPformat*)A->Store)->nnz, - Glu_persist, Glu_freeable); - - iwork = (int_t *) intMalloc_dist(6*m+2*n); - perm_r = iwork; - segrep = iwork + m; - repfnz = segrep + m; - marker = repfnz + m; - parent = marker + m; - xplore = parent + m; - xprune = xplore + m; - relax_end = xprune + n; - relax = sp_ienv_dist(2); - ifill_dist(perm_r, m, EMPTY); - ifill_dist(repfnz, m, EMPTY); - ifill_dist(marker, m, EMPTY); - Glu_persist->supno[0] = -1; - Glu_persist->xsup[0] = 0; - Glu_freeable->xlsub[0] = 0; - Glu_freeable->xusub[0] = 0; - - /*for (j = 0; j < n; ++j) iperm_c[perm_c[j]] = j;*/ - - /* Identify relaxed supernodes. */ - if ( !(desc = intMalloc_dist(n+1)) ) - ABORT("Malloc fails for desc[]");; - relax_snode(n, etree, relax, desc, relax_end); - SUPERLU_FREE(desc); - - for (j = 0; j < min_mn; ) { - if ( relax_end[j] != EMPTY ) { /* beginning of a relaxed snode */ - k = relax_end[j]; /* end of the relaxed snode */ - - /* Determine union of the row structure of supernode (j:k). */ - if ( (info = snode_dfs(A, j, k, xprune, marker, - Glu_persist, Glu_freeable)) != 0 ) - return info; - - for (i = j; i <= k; ++i) - pivotL(i, perm_r, &pivrow, Glu_persist, Glu_freeable); - - j = k+1; - } else { - /* Perform a symbolic factorization on column j, and detects - whether column j starts a new supernode. */ - if ((info = column_dfs(A, j, perm_r, &nseg, segrep, repfnz, - xprune, marker, parent, xplore, - Glu_persist, Glu_freeable)) != 0) - return info; - - /* Copy the U-segments to usub[*]. */ - if ((info = set_usub(min_mn, j, nseg, segrep, repfnz, - Glu_persist, Glu_freeable)) != 0) - return info; - - pivotL(j, perm_r, &pivrow, Glu_persist, Glu_freeable); - - /* Prune columns [0:j-1] using column j. */ - pruneL(j, perm_r, pivrow, nseg, segrep, repfnz, xprune, - Glu_persist, Glu_freeable); - - /* Reset repfnz[*] to prepare for the next column. */ - for (i = 0; i < nseg; i++) { - irep = segrep[i]; - repfnz[irep] = EMPTY; - } - - ++j; - } /* else */ - } /* for j ... */ - - countnz_dist(min_mn, xprune, &nnzL, &nnzU, Glu_persist, Glu_freeable); - - /* Apply perm_r to L; Compress LSUB array. */ - i = fixupL_dist(min_mn, perm_r, Glu_persist, Glu_freeable); - - if ( !pnum && (options->PrintStat == YES)) { - printf("\tNonzeros in L %ld\n", nnzL); - printf("\tNonzeros in U %ld\n", nnzU); - printf("\tnonzeros in L+U %ld\n", nnzL + nnzU - min_mn); - printf("\tnonzeros in LSUB %ld\n", i); - } - SUPERLU_FREE(iwork); - -#if ( PRNTlevel>=3 ) - PrintInt10("lsub", Glu_freeable->xlsub[n], Glu_freeable->lsub); - PrintInt10("xlsub", n+1, Glu_freeable->xlsub); - PrintInt10("xprune", n, xprune); - PrintInt10("usub", Glu_freeable->xusub[n], Glu_freeable->usub); - PrintInt10("xusub", n+1, Glu_freeable->xusub); - PrintInt10("supno", n, Glu_persist->supno); - PrintInt10("xsup", (Glu_persist->supno[n])+2, Glu_persist->xsup); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(pnum, "Exit symbfact()"); -#endif - - return (-i); - -} /* SYMBFACT */ - -/************************************************************************/ -static void relax_snode -/************************************************************************/ -( - const int_t n, /* number of columns in the matrix (input) */ - int_t *et, /* column elimination tree (input) */ - const int_t relax, /* max no of columns allowed in a relaxed snode (input) */ - int_t *desc, /* number of descendants of each etree node. */ - int_t *relax_end /* last column in a supernode (output) */ - ) -{ -/* - * Purpose - * ======= - * relax_snode() identifies the initial relaxed supernodes, assuming that - * the matrix has been reordered according to an postorder of the etree. - * - */ - register int_t j, parent, nsuper; - register int_t fsupc; /* beginning of a snode */ - - ifill_dist(relax_end, n, EMPTY); - ifill_dist(desc, n+1, 0); - nsuper = 0; - - /* Compute the number of descendants of each node in the etree. */ - for (j = 0; j < n; j++) { - parent = et[j]; - if ( parent != n ) /* not the dummy root */ - desc[parent] += desc[j] + 1; - } - - /* Identify the relaxed supernodes by postorder traversal of the etree. */ - for (j = 0; j < n; ) { - parent = et[j]; - fsupc = j; - while ( parent != n && desc[parent] < relax ) { - j = parent; - parent = et[j]; - } - /* Found a supernode with j being the last column. */ - relax_end[fsupc] = j; /* Last column is recorded. */ - ++nsuper; - ++j; - /* Search for a new leaf. */ - while ( desc[j] != 0 && j < n ) ++j; - } - -#if ( DEBUGlevel>=1 ) - printf(".. No of relaxed snodes: %d\trelax: %d\n", nsuper, relax); -#endif -} /* RELAX_SNODE */ - - -/************************************************************************/ -static int_t snode_dfs -/************************************************************************/ -( - SuperMatrix *A, /* original matrix A permuted by columns (input) */ - const int_t jcol, /* beginning of the supernode (input) */ - const int_t kcol, /* end of the supernode (input) */ - int_t *xprune, /* pruned location in each adjacency list (output) */ - int_t *marker, /* working array of size m */ - Glu_persist_t *Glu_persist, /* global LU data structures (modified) */ - Glu_freeable_t *Glu_freeable - ) -{ -/* - * Purpose - * ======= - * snode_dfs() determines the union of the row structures of those - * columns within the relaxed snode. - * Note: The relaxed snodes are leaves of the supernodal etree, therefore, - * the part outside the rectangular supernode must be zero. - * - * Return value - * ============ - * 0 success; - * >0 number of bytes allocated when run out of memory. - * - */ - NCPformat *Astore; - int_t *asub, *xa_begin, *xa_end; - register int_t i, k, ifrom, ito, nextl, new_next; - int_t nsuper, krow, kmark, mem_error; - int_t *xsup, *supno; - int_t *lsub, *xlsub; - int_t nzlmax, nextu; - - Astore = A->Store; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nzlmax = Glu_freeable->nzlmax; - nsuper = ++supno[jcol]; /* Next available supernode number */ - nextl = xlsub[jcol]; - nextu = Glu_freeable->xusub[jcol]; - - for (i = jcol; i <= kcol; i++) { - /* For each nonzero in A[*,i] */ - for (k = xa_begin[i]; k < xa_end[i]; ++k) { - krow = asub[k]; - kmark = marker[krow]; - if ( kmark != kcol ) { /* First time visit krow */ - marker[krow] = kcol; - lsub[nextl++] = krow; - if ( nextl >= nzlmax ) { - if (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, - LSUB, &nzlmax, - Glu_freeable)) - return (mem_error); - lsub = Glu_freeable->lsub; - } - } - } - supno[i] = nsuper; - Glu_freeable->xusub[i+1] = nextu; /* Tidy up the pointers in usub[*]. */ - } - - /* Supernode > 1, then make a copy of the subscripts for pruning */ - if ( jcol < kcol ) { - new_next = nextl + (nextl - xlsub[jcol]); - while ( new_next > nzlmax ) { - if (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, LSUB, - &nzlmax, Glu_freeable)) - return (mem_error); - lsub = Glu_freeable->lsub; - } - ito = nextl; - for (ifrom = xlsub[jcol]; ifrom < nextl; ) - lsub[ito++] = lsub[ifrom++]; - for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; - nextl = ito; - } - - xsup[nsuper+1] = kcol + 1; - supno[kcol+1] = nsuper; - xprune[kcol] = nextl; - xlsub[kcol+1] = nextl; -#if ( PRNTlevel>=3 ) - printf(".. snode_dfs(): (%8d:%8d) nextl %d\n", jcol, kcol, nextl); -#endif - return 0; -} /* SNODE_DFS */ - - - -/************************************************************************/ -static int_t column_dfs -/************************************************************************/ -( - SuperMatrix *A, /* original matrix A permuted by columns (input) */ - const int_t jcol, /* current column number (input) */ - int_t *perm_r, /* row permutation vector (input) */ - int_t *nseg, /* number of U-segments in column jcol (output) */ - int_t *segrep, /* list of U-segment representatives (output) */ - int_t *repfnz, /* list of first nonzeros in the U-segments (output) */ - int_t *xprune, /* pruned location in each adjacency list (output) */ - int_t *marker, /* working array of size m */ - int_t *parent, /* working array of size m */ - int_t *xplore, /* working array of size m */ - Glu_persist_t *Glu_persist, /* global LU data structures (modified) */ - Glu_freeable_t *Glu_freeable - ) -{ -/* - * Purpose - * ======= - * column_dfs() performs a symbolic factorization on column jcol, and - * detects the supernode boundary. This routine uses the row indices of - * A[*,jcol] to start the depth-first search (DFS). - * - * Output - * ====== - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. The routine returns a list of such supernodal - * representatives ( segrep[*] ) in topological order of the DFS that - * generates them. The location of the first nonzero in each such - * supernodal segment is also returned ( repfnz[*] ). - * - * Data structure - * ============== - * (lsub, xlsub): - * lsub[*] contains the compressed subscripts of the supernodes; - * xlsub[j] points to the starting location of the j-th column in - * lsub[*]; - * Storage: original row subscripts in A. - * - * During the course of symbolic factorization, we also use - * (lsub, xlsub, xprune) for the purpose of symmetric pruning. - * For each supernode {s,s+1,...,t=s+r} with first column s and last - * column t, there are two subscript sets, the last column - * structures (for pruning) will be removed in the end. - * o lsub[j], j = xlsub[s], ..., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * o lsub[j], j = xlsub[t], ..., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * (1) if t > s, only the subscript sets for column s and column t - * are stored. Column t represents pruned adjacency structure. - * - * -------------------------------------------- - * lsub[*] ... | col s | col t | ... - * -------------------------------------------- - * ^ ^ ^ - * xlsub[s] xlsub[s+1] xlsub[t+1] - * : : - * : xprune[t] - * xlsub[t] - * xprune[s] - * - * (2) if t == s, i.e., a singleton supernode, the same subscript set - * is used for both G(L) and pruned graph: - * - * -------------------------------------- - * lsub[*] ... | s | ... - * -------------------------------------- - * ^ ^ - * xlsub[s] xlsub[s+1] - * xprune[s] - * - * DFS will traverse the second subscript list, i.e., the part of the - * pruned graph. - * - * Local parameters - * ================ - * nseg: no of segments in current U[*,j] - * jsuper: jsuper=EMPTY if column j does not belong to the same - * supernode as j-1. Otherwise, jsuper=nsuper. - * - * marker: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - * Return value - * ============ - * 0 success; - * > 0 number of bytes allocated when run out of space. - * - */ - NCPformat *Astore; - int_t *asub, *xa_begin, *xa_end; - int_t jcolp1, jcolm1, jsuper, nsuper, nextl; - int_t k, krep, krow, kmark, kperm; - int_t fsupc; /* first column of a supernode */ - int_t myfnz; /* first nonzero column of a U-segment */ - int_t chperm, chmark, chrep, kchild; - int_t xdfs, maxdfs, kpar, oldrep; - int_t jptr, jm1ptr; - int_t ito, ifrom, istop; /* used to compress row subscripts */ - int_t *xsup, *supno, *lsub, *xlsub; - int_t nzlmax; - static int_t first = 1, maxsuper; - int_t mem_error; - - /* Initializations */ - Astore = A->Store; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nzlmax = Glu_freeable->nzlmax; - jcolp1 = jcol + 1; - jcolm1 = jcol - 1; - jsuper = nsuper = supno[jcol]; - nextl = xlsub[jcol]; - if ( first ) { - maxsuper = sp_ienv_dist(3); - first = 0; - } - - *nseg = 0; - - /* For each nonzero in A[*,jcol] perform depth-first search. */ - for (k = xa_begin[jcol]; k < xa_end[jcol]; ++k) { - krow = asub[k]; - kmark = marker[krow]; - - /* krow was visited before, go to the next nonzero. */ - if ( kmark == jcol ) continue; - - /* - * For each unmarked neighber krow of jcol ... - */ - marker[krow] = jcol; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - /* krow is in L: - * place it in structure of L[*,jcol]. - */ - lsub[nextl++] = krow; /* krow is indexed into A */ - if ( nextl >= nzlmax ) { - if ( mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, LSUB, - &nzlmax, Glu_freeable) ) - return (mem_error); - lsub = Glu_freeable->lsub; - } - if ( kmark != jcolm1 ) jsuper = EMPTY; /* Row index subset test */ - } else { - /* krow is in U: - * If its supernode krep has been explored, update repfnz[*]. - */ - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz[krep]; - - if ( myfnz != EMPTY ) { /* krep was visited before */ - if ( kperm < myfnz ) repfnz[krep] = kperm; - /* continue; */ - } else { - /* Otherwise perform DFS, starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - kchild = lsub[xdfs++]; - chmark = marker[kchild]; - - if ( chmark != jcol ) { /* Not reached yet */ - marker[kchild] = jcol; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,k] */ - if ( chperm == EMPTY ) { - lsub[nextl++] = kchild; - if ( nextl >= nzlmax ) { - if ( mem_error = - symbfact_SubXpand(A->ncol, jcol, nextl, - LSUB, &nzlmax, - Glu_freeable) ) - return (mem_error); - lsub = Glu_freeable->lsub; - } - if ( chmark != jcolm1 ) jsuper = EMPTY; - } else { - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep - * has been explored, update its repfnz[*]. - */ - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz[chrep]; - if ( myfnz != EMPTY ) {/* Visited before */ - if (chperm < myfnz) repfnz[chrep] = chperm; - } else { - /* Continue DFS at sup-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L') */ - parent[krep] = oldrep; - repfnz[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - } /* else */ - } /* else */ - } /* if */ - - } /* while */ - - /* krow has no more unexplored neighbors: - * place supernode-rep krep in postorder DFS; - * backtrack DFS to its parent. - */ - segrep[*nseg] = krep; - ++(*nseg); - kpar = parent[krep]; /* Pop from stack; recurse */ - if ( kpar == EMPTY ) break; /* DFS done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - } while ( kpar != EMPTY ); /* Until empty stack */ - } /* else */ - } /* else */ - } /* for each nonzero ... */ - - /* Check to see if jcol belongs in the same supernode as jcol-1 */ - if ( jcol == 0 ) { /* Do nothing for column 0 */ - nsuper = supno[0] = 0; - } else { - fsupc = xsup[nsuper]; - jptr = xlsub[jcol]; /* Not compressed yet */ - jm1ptr = xlsub[jcolm1]; - -#ifdef T2_SUPER - if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; -#endif - /* Make sure the number of columns in a supernode doesn't - exceed threshold. */ - if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; - - /* If jcol starts a new supernode, reclaim storage space in - * lsub[*] from the previous supernode. Note we only store - * the subscript set of the first and last columns of - * a supernode. (first for G(L'), last for pruned graph) - */ - if ( jsuper ==EMPTY ) { /* Starts a new supernode */ - if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ -#ifdef CHK_COMPRESS - printf(" Compress lsub[] at super %d-%d\n",fsupc,jcolm1); -#endif - ito = xlsub[fsupc+1]; - xlsub[jcolm1] = ito; - istop = ito + jptr - jm1ptr; - xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ - xlsub[jcol] = istop; - for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) - lsub[ito] = lsub[ifrom]; - nextl = ito; /* = istop + length(jcol) */ - } - ++nsuper; - supno[jcol] = nsuper; - } /* if a new supernode */ - - } /* else: jcol > 0 */ - - /* Tidy up the pointers before exit */ - xsup[nsuper+1] = jcolp1; - supno[jcolp1] = nsuper; - xprune[jcol] = nextl; /* Initialize an upper bound for pruning. */ - xlsub[jcolp1] = nextl; - return 0; -} /* COLUMN_DFS */ - -/************************************************************************/ -static int_t pivotL -/************************************************************************/ -( - const int_t jcol, /* current column number (input) */ - int_t *perm_r, /* row permutation vector (output) */ - int_t *pivrow, /* the pivot row index (output) */ - Glu_persist_t *Glu_persist, /* global LU data structures (modified) */ - Glu_freeable_t *Glu_freeable - ) -{ -/* Purpose - * ======= - * pivotL() interchanges row subscripts so that each diagonal block of a - * supernode in L has the row subscripts sorted in order of pivots. - * The row subscripts in the off-diagonal block are not sorted. - * - */ - int_t fsupc; /* first column in the supernode */ - int_t nsupc; /* number of columns in the supernode */ - int_t nsupr; /* number of rows in the supernode */ - int_t lptr; /* point_ts to the first subscript of the supernode */ - int_t diag, diagind; - int_t *lsub_ptr; - int_t isub, itemp; - int_t *lsub, *xlsub; - - /* Initialization. */ - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - fsupc = (Glu_persist->xsup)[(Glu_persist->supno)[jcol]]; - nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ - lptr = xlsub[fsupc]; - nsupr = xlsub[fsupc+1] - lptr; - lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ - - /* Search for diagonal element. */ - /* diagind = iperm_c[jcol];*/ - diagind = jcol; - diag = EMPTY; - for (isub = nsupc; isub < nsupr; ++isub) - if ( lsub_ptr[isub] == diagind ) { - diag = isub; - break; - } - - /* Diagonal pivot exists? */ - if ( diag == EMPTY ) { - printf("At column %d, ", jcol); - ABORT("pivotL() encounters zero diagonal"); - } - - /* Record pivot row. */ - *pivrow = lsub_ptr[diag]; - perm_r[*pivrow] = jcol; /* perm_r[] should be Identity. */ - /*assert(*pivrow==jcol);*/ - - /* Interchange row subscripts. */ - if ( diag != nsupc ) { - itemp = lsub_ptr[diag]; - lsub_ptr[diag] = lsub_ptr[nsupc]; - lsub_ptr[nsupc] = itemp; - } - - return 0; -} /* PIVOTL */ - - -/************************************************************************/ -static int_t set_usub -/************************************************************************/ -( - const int_t n, /* total number of columns (input) */ - const int_t jcol, /* current column number (input) */ - const int_t nseg, /* number of supernodal segments in U[*,jcol] (input) */ - int_t *segrep, /* list of U-segment representatives (output) */ - int_t *repfnz, /* list of first nonzeros in the U-segments (output) */ - Glu_persist_t *Glu_persist, /* global LU data structures (modified) */ - Glu_freeable_t *Glu_freeable - ) -{ -/* - * Purpose - * ======= - * set_usub() sets up data structure to store supernodal segments in U. - * The supernodal segments in each column are stored in topological order. - * - * NOTE - * ==== - * For each supernodal segment, we only store the index of the first - * nonzero index, rather than the indices of the whole segment, because - * those indices can be generated from first nonzero and supnodal - * representative. - * Therefore, for G(U), we store the "skeleton" of it. - * - */ - int_t ksub, krep, ksupno; - int_t k, kfnz; - int_t jsupno, nextu; - int_t new_next, mem_error; - int_t *supno; - int_t *usub, *xusub; - int_t nzumax; - - supno = Glu_persist->supno; - usub = Glu_freeable->usub; - xusub = Glu_freeable->xusub; - nzumax = Glu_freeable->nzumax; - jsupno = supno[jcol]; - nextu = xusub[jcol]; - - new_next = nextu + nseg; - while ( new_next > nzumax ) { - if (mem_error = symbfact_SubXpand(n, jcol, nextu, USUB, &nzumax, - Glu_freeable)) - return (mem_error); - usub = Glu_freeable->usub; - } - - /* We store U-segments in topological order. */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ++ksub) { - krep = segrep[k--]; - ksupno = supno[krep]; - - if ( ksupno != jsupno ) { /* Should go into usub[*] */ - kfnz = repfnz[krep]; - if ( kfnz != EMPTY ) { /* Nonzero U-segment */ - usub[nextu++] = kfnz; - -/* fsupc = xsup[ksupno]; - isub = xlsub[fsupc] + kfnz - fsupc; - irow = lsub[isub]; - usub[nextu++] = perm_r[irow];*/ - } /* if ... */ - } /* if ... */ - } /* for each segment... */ - - xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ - return 0; -} /* SET_USUB */ - - -/************************************************************************/ -static void pruneL -/************************************************************************/ -( - const int_t jcol, /* in */ - const int_t *perm_r, /* in */ - const int_t pivrow, /* in */ - const int_t nseg, /* in */ - const int_t *segrep, /* in */ - const int_t *repfnz, /* in */ - int_t *xprune, /* out */ - Glu_persist_t *Glu_persist, /* global LU data structures (modified) */ - Glu_freeable_t *Glu_freeable - ) -{ -/* - * Purpose - * ======= - * pruneL() prunes the L-structure of supernodes whose L-structure - * contains the current pivot row "pivrow". - * - */ - int_t jsupno, irep, irep1, kmin, kmax, krow; - int_t i, ktemp; - int_t do_prune; /* logical variable */ - int_t *supno; - int_t *lsub, *xlsub; - - supno = Glu_persist->supno; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - - /* - * For each supernode-rep irep in U[*,j] - */ - jsupno = supno[jcol]; - for (i = 0; i < nseg; i++) { - irep = segrep[i]; - irep1 = irep + 1; - - /* Do not prune with a zero U-segment */ - if ( repfnz[irep] == EMPTY ) continue; - - /* - * If irep has not been pruned & it has a nonzero in row L[pivrow,i] - */ - do_prune = FALSE; - if ( supno[irep] != jsupno ) { - if ( xprune[irep] >= xlsub[irep1] ) { - kmin = xlsub[irep]; - kmax = xlsub[irep1] - 1; - for (krow = kmin; krow <= kmax; ++krow) - if ( lsub[krow] == pivrow ) { - do_prune = TRUE; - break; - } - } - - if ( do_prune ) { - /* Do a quicksort-type partition. */ - while ( kmin <= kmax ) { - if ( perm_r[lsub[kmax]] == EMPTY ) - kmax--; - else if ( perm_r[lsub[kmin]] != EMPTY ) - kmin++; - else { /* kmin below pivrow, and kmax above pivrow: - * interchange the two subscripts - */ - ktemp = lsub[kmin]; - lsub[kmin] = lsub[kmax]; - lsub[kmax] = ktemp; - kmin++; - kmax--; - } - } /* while */ - xprune[irep] = kmin; /* Pruning */ -#if ( DEBUGlevel>=3 ) - printf(".. pruneL(): use col %d: xprune[%d] = %d\n", - jcol, irep, kmin); -#endif - } /* if do_prune */ - } /* if */ - } /* for each U-segment ... */ -} /* PRUNEL */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,769 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * February 1, 2003 - * - */ - -#include -#include "superlu_ddefs.h" - - -/* Deallocate the structure pointing to the actual storage of the matrix. */ -void -Destroy_SuperMatrix_Store_dist(SuperMatrix *A) -{ - SUPERLU_FREE ( A->Store ); -} - -void -Destroy_CompCol_Matrix_dist(SuperMatrix *A) -{ - NCformat *Astore = A->Store; - SUPERLU_FREE( Astore->rowind ); - SUPERLU_FREE( Astore->colptr ); - if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); -} - -void -Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) -{ - NRformat_loc *Astore = A->Store; - SUPERLU_FREE( Astore->rowptr ); - SUPERLU_FREE( Astore->colind ); - SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); -} - -void -Destroy_CompRow_Matrix_dist(SuperMatrix *A) -{ - SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); - SUPERLU_FREE( ((NRformat *)A->Store)->colind ); - SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); -} - -void -Destroy_SuperNode_Matrix_dist(SuperMatrix *A) -{ - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); - SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); - SUPERLU_FREE ( A->Store ); -} - -/* A is of type Stype==NCP */ -void -Destroy_CompCol_Permuted_dist(SuperMatrix *A) -{ - SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); - SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); - SUPERLU_FREE ( A->Store ); -} - -/* A is of type Stype==DN */ -void -Destroy_Dense_Matrix_dist(SuperMatrix *A) -{ - DNformat* Astore = A->Store; - SUPERLU_FREE (Astore->nzval); - SUPERLU_FREE ( A->Store ); -} - -/* Destroy distributed L & U matrices. */ -void -Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) -{ - int_t i, nb, nsupers; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_LU()"); -#endif - - nsupers = Glu_persist->supno[n-1] + 1; - - nb = CEILING(nsupers, grid->npcol); - for (i = 0; i < nb; ++i) - if ( Llu->Lrowind_bc_ptr[i] ) { - SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); - } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - SUPERLU_FREE (Llu->Lnzval_bc_ptr); - - nb = CEILING(nsupers, grid->nprow); - for (i = 0; i < nb; ++i) - if ( Llu->Ufstnz_br_ptr[i] ) { - SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); - SUPERLU_FREE (Llu->Unzval_br_ptr[i]); - } - SUPERLU_FREE (Llu->Ufstnz_br_ptr); - SUPERLU_FREE (Llu->Unzval_br_ptr); - - /* The following can be freed after factorization. */ - SUPERLU_FREE(Llu->ToRecv); - SUPERLU_FREE(Llu->ToSendD); - SUPERLU_FREE(Llu->ToSendR[0]); - SUPERLU_FREE(Llu->ToSendR); - - /* The following can be freed only after iterative refinement. */ - SUPERLU_FREE(Llu->ilsum); - SUPERLU_FREE(Llu->fmod); - SUPERLU_FREE(Llu->fsendx_plist[0]); - SUPERLU_FREE(Llu->fsendx_plist); - SUPERLU_FREE(Llu->bmod); - SUPERLU_FREE(Llu->bsendx_plist[0]); - SUPERLU_FREE(Llu->bsendx_plist); - - SUPERLU_FREE(Glu_persist->xsup); - SUPERLU_FREE(Glu_persist->supno); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit Destroy_LU()"); -#endif -} - -/* Allocate storage in ScalePermstruct */ -void ScalePermstructInit(const int_t m, const int_t n, - ScalePermstruct_t *ScalePermstruct) -{ - ScalePermstruct->DiagScale = NOEQUIL; - if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) - ABORT("Malloc fails for perm_r[]."); - if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) - ABORT("Malloc fails for perm_c[]."); -} - -/* Deallocate ScalePermstruct */ -void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct) -{ - SUPERLU_FREE(ScalePermstruct->perm_r); - SUPERLU_FREE(ScalePermstruct->perm_c); - switch ( ScalePermstruct->DiagScale ) { - case ROW: - SUPERLU_FREE(ScalePermstruct->R); - break; - case COL: - SUPERLU_FREE(ScalePermstruct->C); - break; - case BOTH: - SUPERLU_FREE(ScalePermstruct->R); - SUPERLU_FREE(ScalePermstruct->C); - break; - } -} - -/* Allocate storage in LUstruct */ -void LUstructInit(const int_t m, const int_t n, LUstruct_t *LUstruct) -{ - if ( !(LUstruct->etree = intMalloc_dist(n)) ) - ABORT("Malloc fails for etree[]."); - if ( !(LUstruct->Glu_persist = (Glu_persist_t *) - SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) - ABORT("Malloc fails for Glu_persist_t."); - if ( !(LUstruct->Llu = (LocalLU_t *) - SUPERLU_MALLOC(sizeof(LocalLU_t))) ) - ABORT("Malloc fails for LocalLU_t."); -} - -/* Deallocate LUstruct */ -void LUstructFree(LUstruct_t *LUstruct) -{ - SUPERLU_FREE(LUstruct->etree); - SUPERLU_FREE(LUstruct->Glu_persist); - SUPERLU_FREE(LUstruct->Llu); -} - -/* - * Count the total number of nonzeros in factors L and U, and in the - * symmetrically reduced L. - */ -void -countnz_dist(const int_t n, int_t *xprune, int_t *nnzL, int_t *nnzU, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) -{ - int_t fnz, fsupc, i, j, nsuper; - int_t nnzL0, jlen, irep; - int_t *supno, *xsup, *xlsub, *xusub, *usub; - - supno = Glu_persist->supno; - xsup = Glu_persist->xsup; - xlsub = Glu_freeable->xlsub; - xusub = Glu_freeable->xusub; - usub = Glu_freeable->usub; - *nnzL = 0; - *nnzU = 0; - nnzL0 = 0; - nsuper = supno[n]; - - if ( n <= 0 ) return; - - /* - * For each supernode in L. - */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jlen = xlsub[fsupc+1] - xlsub[fsupc]; - - for (j = fsupc; j < xsup[i+1]; j++) { - *nnzL += jlen; - *nnzU += j - fsupc + 1; - jlen--; - } - irep = xsup[i+1] - 1; - nnzL0 += xprune[irep] - xlsub[irep]; - } - - /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ - - /* For each column in U. */ - for (j = 0; j < n; ++j) { - for (i = xusub[j]; i < xusub[j+1]; ++i) { - fnz = usub[i]; - fsupc = xsup[supno[fnz]+1]; - *nnzU += fsupc - fnz; - } - } -} - - - -/* - * Fix up the data storage lsub for L-subscripts. It removes the subscript - * sets for structural pruning, and applies permuation to the remaining - * subscripts. - */ -int_t -fixupL_dist(const int_t n, const int_t *perm_r, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) -{ - register int_t nsuper, fsupc, nextl, i, j, k, jstrt, lsub_size; - int_t *xsup, *lsub, *xlsub; - - if ( n <= 1 ) return 0; - - xsup = Glu_persist->xsup; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nextl = 0; - nsuper = (Glu_persist->supno)[n]; - lsub_size = xlsub[n]; - - /* - * For each supernode ... - */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jstrt = xlsub[fsupc]; - xlsub[fsupc] = nextl; - for (j = jstrt; j < xlsub[fsupc+1]; j++) { - lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ - nextl++; - } - for (k = fsupc+1; k < xsup[i+1]; k++) - xlsub[k] = nextl; /* Other columns in supernode i */ - - } - - xlsub[n] = nextl; - return lsub_size; -} - -/* - * Set the default values for the options argument. - */ -void set_default_options_dist(superlu_options_t *options) -{ - options->Fact = DOFACT; - options->Equil = YES; - options->ParSymbFact = NO; - options->ColPerm = MMD_AT_PLUS_A; - options->RowPerm = LargeDiag; - options->ReplaceTinyPivot = YES; - options->IterRefine = DOUBLE; - options->Trans = NOTRANS; - options->SolveInitialized = NO; - options->RefineInitialized = NO; - options->PrintStat = YES; -} - -/* - * Print the options setting. - */ -void print_options_dist(superlu_options_t *options) -{ - printf(".. options:\n"); - printf("\tFact\t %8d\n", options->Fact); - printf("\tEquil\t %8d\n", options->Equil); - printf("\tColPerm\t %8d\n", options->ColPerm); - printf("\tRowPerm\t %8d\n", options->RowPerm); - printf("\tReplaceTinyPivot %4d\n", options->ReplaceTinyPivot); - printf("\tTrans\t %8d\n", options->Trans); - printf("\tIterRefine\t%4d\n", options->IterRefine); - printf("..\n"); -} - -int_t -pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, - int_t perm_r[], int_t perm_c[], gridinfo_t *grid, - Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) -{ -/* - * Purpose - * ======= - * Set up the communication pattern for the triangular solution. - * - * Arguments - * ========= - * - * n (input) int (global) - * The dimension of the linear system. - * - * m_loc (input) int (local) - * The local row dimension of the distributed input matrix. - * - * nrhs (input) int (global) - * Number of right-hand sides. - * - * fst_row (input) int (global) - * The row number of matrix B's first row in the global matrix. - * - * perm_r (input) int* (global) - * The row permutation vector. - * - * perm_c (input) int* (global) - * The column permutation vector. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - */ - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; - int_t *row_to_proc; - int_t i, gbi, k, l, num_diag_procs, *diag_procs; - int_t irow, lk, q, knsupc, nsupers, *xsup, *supno; - int iam, p, pkk, procs; - pxgstrs_comm_t *gstrs_comm; - - procs = grid->nprow * grid->npcol; - iam = grid->iam; - gstrs_comm = SOLVEstruct->gstrs_comm; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; - row_to_proc = SOLVEstruct->row_to_proc; - - /* ------------------------------------------------------------ - SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. - ------------------------------------------------------------*/ - if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) - ABORT("Malloc fails for B_to_X_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2*procs; - RecvCnt_nrhs = itemp + 3*procs; - sdispls = itemp + 4*procs; - sdispls_nrhs = itemp + 5*procs; - rdispls = itemp + 6*procs; - rdispls_nrhs = itemp + 7*procs; - - /* Count the number of elements to be sent to each diagonal process.*/ - for (p = 0; p < procs; ++p) SendCnt[p] = 0; - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - ++SendCnt[p]; - } - - /* Set up the displacements for alltoall. */ - MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - for (p = 1; p < procs; ++p) { - sdispls[p] = sdispls[p-1] + SendCnt[p-1]; - rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; - } - for (p = 0; p < procs; ++p) { - SendCnt_nrhs[p] = SendCnt[p] * nrhs; - sdispls_nrhs[p] = sdispls[p] * nrhs; - RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; - rdispls_nrhs[p] = rdispls[p] * nrhs; - } - - /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ - gstrs_comm->B_to_X_SendCnt = SendCnt; - - /* ------------------------------------------------------------ - SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. - ------------------------------------------------------------*/ - /* This is freed in pxgstrs_finalize(). */ - if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) - ABORT("Malloc fails for X_to_B_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2*procs; - RecvCnt_nrhs = itemp + 3*procs; - sdispls = itemp + 4*procs; - sdispls_nrhs = itemp + 5*procs; - rdispls = itemp + 6*procs; - rdispls_nrhs = itemp + 7*procs; - - /* Count the number of X entries to be sent to each process.*/ - for (p = 0; p < procs; ++p) SendCnt[p] = 0; - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number */ - irow = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) { -#if 0 - q = row_to_proc[inv_perm_c[irow]]; -#else - q = row_to_proc[irow]; -#endif - ++SendCnt[q]; - ++irow; - } - } - } - } - - MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - sdispls_nrhs[0] = rdispls_nrhs[0] = 0; - SendCnt_nrhs[0] = SendCnt[0] * nrhs; - RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; - for (p = 1; p < procs; ++p) { - sdispls[p] = sdispls[p-1] + SendCnt[p-1]; - rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; - sdispls_nrhs[p] = sdispls[p] * nrhs; - rdispls_nrhs[p] = rdispls[p] * nrhs; - SendCnt_nrhs[p] = SendCnt[p] * nrhs; - RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; - } - - /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ - gstrs_comm->X_to_B_SendCnt = SendCnt; - - if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) - ABORT("Malloc fails for ptr_to_ibuf[]."); - gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; - gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; - -} /* PXGSTRS_INIT */ - - -void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) -{ - SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); - SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); - SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); - SUPERLU_FREE(gstrs_comm); -} - - -/* - * Diagnostic print of segment info after panel_dfs(). - */ -void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, - int_t *segrep, int_t *repfnz) -{ - int_t j, k; - - for (j = jcol; j < jcol+w; j++) { - printf("\tcol %d:\n", j); - for (k = 0; k < nseg; k++) - printf("\t\tseg %d, segrep %d, repfnz %d\n", k, - segrep[k], repfnz[(j-jcol)*n + segrep[k]]); - } - -} - -void -PStatInit(SuperLUStat_t *stat) -{ - register int_t i; - - if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) ) - ABORT("Malloc fails for stat->utime[]"); - if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) ) - ABORT("SUPERLU_MALLOC fails for stat->ops[]"); - for (i = 0; i < NPHASES; ++i) { - stat->utime[i] = 0.; - stat->ops[i] = 0.; - } - stat->TinyPivots = stat->RefineSteps = 0; -} - -void -PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) -{ - double *utime = stat->utime; - flops_t *ops = stat->ops; - int_t iam = grid->iam; - flops_t flopcnt, factflop, solveflop; - - if ( options->PrintStat == NO ) return; - - if ( !iam && options->Fact != FACTORED ) { - if ( options->Equil != NO ) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); - if ( options->RowPerm != NOROWPERM ) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); - if ( options->ColPerm != NATURAL ) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); - printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); - - } - - MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); - factflop = flopcnt; - if ( !iam && options->Fact != FACTORED ) { - printf("\tFACTOR time %8.2f\n", utime[FACT]); - if ( utime[FACT] != 0.0 ) - printf("\tFactor flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[FACT]); - } - - MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); - solveflop = flopcnt; - if ( !iam ) { - printf("\tSOLVE time %8.2f\n", utime[SOLVE]); - if ( utime[SOLVE] != 0.0 ) - printf("\tSolve flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[SOLVE]); - } - - if ( !iam && options->IterRefine != NOREFINE ) { - printf("\tREFINEMENT time %8.2f\tSteps%8d\n\n", - utime[REFINE], stat->RefineSteps); - } - -#if ( PROFlevel>=1 ) - fflush(stdout); - MPI_Barrier( grid->comm ); - - { - int_t i, P = grid->nprow*grid->npcol; - flops_t b, maxflop; - if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); - for (i = 0; i < P; ++i) { - if ( iam == i) { - printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - if ( !iam ) printf("\n.. FACT ops distribution:\n"); - for (i = 0; i < P; ++i) { - if ( iam == i ) { - printf("\t\t(%d)\t%e\n", iam, ops[FACT]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - if ( !iam ) { - b = factflop/P/maxflop; - printf("\tFACT load balance: %.2f\n", b); - } - if ( !iam ) printf("\n.. SOLVE ops distribution:\n"); - for (i = 0; i < P; ++i) { - if ( iam == i ) { - printf("\t\t%d\t%e\n", iam, ops[SOLVE]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); - if ( !iam ) { - b = solveflop/P/maxflop; - printf("\tSOLVE load balance: %.2f\n", b); - } - } -#endif - -/* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ -} - -void -PStatFree(SuperLUStat_t *stat) -{ - SUPERLU_FREE(stat->utime); - SUPERLU_FREE(stat->ops); -} - -/* - * Fills an integer array with a given value. - */ -void ifill_dist(int_t *a, int_t alen, int_t ival) -{ - register int_t i; - for (i = 0; i < alen; i++) a[i] = ival; -} - - -void -get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) -{ - int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; - int_t *xsup; - - i = j = *num_diag_procs = pkk = 0; - nprow = grid->nprow; - npcol = grid->npcol; - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - - do { - ++(*num_diag_procs); - i = (++i) % nprow; - j = (++j) % npcol; - pkk = PNUM( i, j, grid ); - } while ( pkk != 0 ); /* Until wrap back to process 0 */ - if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) - ABORT("Malloc fails for diag_procs[]"); - if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) - ABORT("Calloc fails for diag_len[]"); - for (i = j = k = 0; k < *num_diag_procs; ++k) { - pkk = PNUM( i, j, grid ); - (*diag_procs)[k] = pkk; - i = (++i) % nprow; - j = (++j) % npcol; - } - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - i = k % *num_diag_procs; - (*diag_len)[i] += knsupc; - } -} - - -/* - * Get the statistics of the supernodes - */ -#define NBUCKS 10 -static int_t max_sup_size; - -void super_stats_dist(int_t nsuper, int_t *xsup) -{ - register int_t nsup1 = 0; - int_t i, isize, whichb, bl, bh; - int_t bucket[NBUCKS]; - - max_sup_size = 0; - - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - if ( isize == 1 ) nsup1++; - if ( max_sup_size < isize ) max_sup_size = isize; - } - - printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); - printf("\tmax supernode size = %d\n", max_sup_size); - printf("\tno of size 1 supernodes = %d\n", nsup1); - - /* Histogram of the supernode sizes */ - ifill_dist (bucket, NBUCKS, 0); - - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - whichb = (float) isize / max_sup_size * NBUCKS; - if (whichb >= NBUCKS) whichb = NBUCKS - 1; - bucket[whichb]++; - } - - printf("\tHistogram of supernode sizes:\n"); - for (i = 0; i < NBUCKS; i++) { - bl = (float) i * max_sup_size / NBUCKS; - bh = (float) (i+1) * max_sup_size / NBUCKS; - printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); - } - -} - - -/* - * Check whether repfnz[] == EMPTY after reset. - */ -void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) -{ - int_t jj, k; - - for (jj = jcol; jj < jcol+w; jj++) - for (k = 0; k < n; k++) - if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { - fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, - k, repfnz[(jj-jcol)*n + k]); - ABORT("check_repfnz_dist"); - } -} - -void PrintInt10(char *name, int_t len, int_t *x) -{ - register int_t i; - - printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9); - printf("%6d", x[i]); - } - printf("\n"); -} - -int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) -{ - register int_t i; - - fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9); - fprintf(fp, "%6d", x[i]); - } - fprintf(fp, "\n"); -} - -int_t -CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) -{ - register int_t i, j, zd, numzd = 0; - - for (j = 0; j < n; ++j) { - zd = 0; - for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) { - /*if ( iperm[rowind[i]] == j ) zd = 1;*/ - if ( rowind[i] == j ) { zd = 1; break; } - } - if ( zd == 0 ) { -#if ( PRNTlevel>=2 ) - printf(".. Diagonal of column %d is zero.\n", j); -#endif - ++numzd; - } - } - - return numzd; -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util_dist.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util_dist.h --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util_dist.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util_dist.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ -#ifndef __SUPERLU_UTIL /* allow multiple inclusions */ -#define __SUPERLU_UTIL - -#include -#include -#include -#include - -/* - * Macros - */ -#ifndef USER_ABORT -#define USER_ABORT(msg) superlu_abort_and_exit_dist(msg) -#endif - -#define ABORT(err_msg) \ - { char msg[256];\ - sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\ - USER_ABORT(msg); } - - -#ifndef USER_MALLOC -#define USER_MALLOC(size) superlu_malloc_dist(size) -#endif - -#define SUPERLU_MALLOC(size) USER_MALLOC(size) - -#ifndef USER_FREE -#define USER_FREE(addr) superlu_free_dist(addr) -#endif - -#define SUPERLU_FREE(addr) USER_FREE(addr) - -#define CHECK_MALLOC(pnum, where) { \ - extern long int superlu_malloc_total; \ - printf("(%d) %s: superlu_malloc_total (KBytes) %.3f\n", \ - pnum, where, superlu_malloc_total*1e-3); \ -} - -#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) -#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) ) - -/* - * Constants - */ -#define EMPTY (-1) -#ifndef FALSE -#define FALSE (0) -#endif -#ifndef TRUE -#define TRUE (1) -#endif - -/* - * Type definitions - */ -typedef float flops_t; -typedef unsigned char Logical; - -/* -#ifdef _CRAY -#define int short -#endif -*/ - -/* - * The following enumerate type is used by the statistics variable - * to keep track of flop count and time spent at various stages. - * - * Note that not all of the fields are disjoint. - */ -typedef enum { - COLPERM, /* find a column ordering that minimizes fills */ - ROWPERM, /* find a row ordering maximizes diagonal. */ - RELAX, /* find artificial supernodes */ - ETREE, /* compute column etree */ - EQUIL, /* equilibrate the original matrix */ - SYMBFAC, /* symbolic factorization. */ - DIST, /* distribute matrix. */ - FACT, /* perform LU factorization */ - COMM, /* communication for factorization */ - SOL_COMM,/* communication for solve */ - RCOND, /* estimate reciprocal condition number */ - SOLVE, /* forward and back solves */ - REFINE, /* perform iterative refinement */ - FLOAT, /* time spent in floating-point operations */ - TRSV, /* fraction of FACT spent in xTRSV */ - GEMV, /* fraction of FACT spent in xGEMV */ - FERR, /* estimate error bounds after iterative refinement */ - NPHASES /* total number of phases */ -} PhaseType; - -typedef struct { - int *panel_histo; /* histogram of panel size distribution */ - double *utime; /* running time at various phases */ - flops_t *ops; /* operation count at various phases */ - int TinyPivots; /* number of tiny pivots */ - int RefineSteps; /* number of iterative refinement steps */ -} SuperLUStat_t; - -/* Headers for 2 types of dynamatically managed memory */ -typedef struct e_node { - int size; /* length of the memory that has been used */ - void *mem; /* pointer to the new malloc'd store */ -} ExpHeader; - -typedef struct { - int size; - int used; - int top1; /* grow upward, relative to &array[0] */ - int top2; /* grow downward */ - void *array; -} LU_stack_t; - -/* Constants */ -#define GluIntArray(n) (5 * (n) + 5) -#define NO_MEMTYPE 4 /* 0: lusup; - 1: ucol; - 2: lsub; - 3: usub */ - -/* Macros to manipulate stack */ -#define StackFull(x) ( x + stack.used >= stack.size ) -#define NotDoubleAlign(addr) ( (long)addr & 7 ) -#define DoubleAlign(addr) ( ((long)addr + 7) & ~7L ) -#define TempSpace(n, w) ( (2*w + 4 + NO_MARKER)*m*sizeof(int) + \ - (w + 1)*n*sizeof(double) ) -#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ - -#define FIRSTCOL_OF_SNODE(i) (xsup[i]) - -#if ( PROFlevel>=1 ) -#define TIC(t) t = SuperLU_timer_() -#define TOC(t2, t1) t2 = SuperLU_timer_() - t1 -#else -#define TIC(t) -#define TOC(t2, t1) -#endif - -/********************************************************* - * Macros used for easy access of sparse matrix entries. * - *********************************************************/ -#define L_SUB_START(col) ( Lstore->rowind_colptr[col] ) -#define L_SUB(ptr) ( Lstore->rowind[ptr] ) -#define L_NZ_START(col) ( Lstore->nzval_colptr[col] ) -#define L_FST_SUPC(superno) ( Lstore->sup_to_col[superno] ) -#define U_NZ_START(col) ( Ustore->colptr[col] ) -#define U_SUB(ptr) ( Ustore->rowind[ptr] ) - -#endif /* __SUPERLU_UTIL */ diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/xerbla.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/xerbla.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/xerbla.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/xerbla.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -#include "Cnames.h" - -/* Subroutine */ int xerbla_(char *srname, int *info) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - XERBLA is an error handler for the LAPACK routines. - It is called by an LAPACK routine if an input parameter has an - invalid value. A message is printed and execution stops. - - Installers may consider modifying the STOP statement in order to - call system-specific exception-handling facilities. - - Arguments - ========= - - SRNAME (input) CHARACTER*6 - The name of the routine which called XERBLA. - - INFO (input) INT - The position of the invalid parameter in the parameter list - - of the calling routine. - - ===================================================================== -*/ - - printf("** On entry to %6s, parameter number %2d had an illegal value\n", - srname, *info); - -/* End of XERBLA */ - - return 0; -} /* xerbla_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,724 +0,0 @@ - -#include "superlu_zdefs.h" - -int_t -zdistribute(fact_t fact, int_t n, SuperMatrix *A, - Glu_freeable_t *Glu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, - len, len1, nsupc; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NCPformat *Astore; - doublecomplex *a; - int_t *asub; - int_t *xa_begin, *xa_end; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ - doublecomplex zero = {0.0, 0.0}; - int_t ldaspa; /* LDA of SPA */ - int_t mem_use = 0, iword, zword; - -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif -#if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; -#endif - - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - zword = sizeof(doublecomplex); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter zdistribute()"); -#endif - - if ( fact == SamePattern_SameRowPerm ) { - /* --------------------------------------------------------------- - * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. - * --------------------------------------------------------------- */ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* We can propagate the new values of A into the existing - L and U data structures. */ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doublecomplexCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; -#if ( PRNTlevel>=1 ) - mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword; -#endif -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); -#endif - - } else { - /* -------------------------------------------------- - * FIRST TIME CREATING THE L AND U DATA STRUCTURE. - * -------------------------------------------------- */ - -#if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; -#endif - /* No L and U data structures are available yet. - We need to set up the L and U data structures and propagate - the values of A into them. */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for ToRecv[]."); - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) - ABORT("Malloc fails for index[]."); -#if ( PRNTlevel>=1 ) - mem_use += k*sizeof(int_t*) + (j + nsupers)*iword; -#endif - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = intCalloc_dist(k)) ) - ABORT("Malloc fails for ToSendD[]."); - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. */ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); -#if ( PRNTlevel>=1 ) - mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword; -#endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - - - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); -#endif -#if ( PRNTlevel>=1 ) - mem_use -= 2*k * iword; -#endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if (!(dense=doublecomplexCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) - *sp_ienv_dist(3))))) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); -#if ( PRNTlevel>=1 ) - mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*zword; -#endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; -#if ( PRNTlevel>=1 ) - mem_use += 4*k*sizeof(int_t*) + 2*len*iword; -#endif - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } - - jbrow = PROW( jb, grid ); - -#if ( PROFlevel>=1 ) - t = SuperLU_timer_(); -#endif - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ - -#if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); -#endif - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if (!(Lnzval_bc_ptr[ljb] = doublecomplexMalloc_dist(((size_t)len)*nsupc))) { - fprintf(stderr, "col block %d ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - /* Propagate the compressed row subscripts to Lindex[], and - the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb] = NULL; - Lnzval_bc_ptr[ljb] = NULL; - } /* if nrbl ... */ -#if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ - - } /* for jb ... */ - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - -#if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); -#endif - - } /* else fact != SamePattern_SameRowPerm */ - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit zdistribute()"); -#endif - - return (mem_use); -} /* ZDISTRIBUTE */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,695 +0,0 @@ -#include "superlu_zdefs.h" - -/* - * NOTE zdistribute_mark.c - * ==== - * This version is faster for Mark Baertschy's matrices, remains to be - * tested for the other matrices. - * - * Main difference: there is no dense SPA involved when distributing A into - * the U structure. That is, the entries in upper triangle of A are loaded - * directly into U. - * - * The locations of modifications have XSL comments. - * - * Date: Apr 23 09:54:15 PDT 2001 - */ -int_t -zdistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = NCP; Dtype = Z; Mtype = GE. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - */ -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc; - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NCPformat *Astore; - doublecomplex *a; - int_t *asub; - int_t *xa_begin, *xa_end; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ - doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - - /*-- Counts to be used in factorization. --*/ - int_t *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ - doublecomplex zero = {0.0, 0.0}; - int_t ldaspa; /* LDA of SPA */ - int_t mem_use = 0, iword, zword; -#if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; -#endif - - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - zword = sizeof(doublecomplex); -#endif - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter zdistribute()"); -#endif - - if ( fact == SamePattern_SameRowPerm ) { - /* We can propagate the new values of A into the existing - L and U data structures. */ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - for (lb = 0; lb < nrbu; ++lb) - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; -#if ( PRNTlevel>=1 ) - mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword; -#endif - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } - - /* Gather the values of A from SPA into Unzval[]. */ - for (lb = 0; lb < nrbu; ++lb) { - index = Ufstnz_br_ptr[lb]; - if ( index && index[Urb_indptr[lb]] == jb ) { - uval = Unzval_br_ptr[lb]; - len = Urb_indptr[lb] + UB_DESCRIPTOR; - gb = lb * grid->nprow + myrow;/* Global block number */ - k = FstBlockC( gb+1 ); - irow = ilsum[lb] - FstBlockC( gb ); - for (jj = 0, dense_col = dense; jj < nsupc; ++jj) { - j = index[len+jj]; - for (i = j; i < k; ++i) { - uval[Urb_length[lb]++] = dense_col[irow+i]; - dense_col[irow+i] = zero; - } - dense_col += ldaspa; - } - Urb_indptr[lb] += UB_DESCRIPTOR + nsupc; - } - } /* for lb ... */ - - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ - - } /* if mycol == pc */ - } /* for jb ... */ - - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - - } else { - /* No L and U data structures are available yet. - We need to set up the L and U data structures and propagate - the values of A into them. */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for ToRecv[]."); - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index = intMalloc_dist(j)) ) - ABORT("Malloc fails for index[]."); -#if ( PRNTlevel>=1 ) - mem_use = k*sizeof(int_t*) + (j + nsupers)*iword; -#endif - for (i = 0; i < j; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; - - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = intCalloc_dist(k)) ) - ABORT("Malloc fails for ToSendD[]."); - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. */ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); -#if ( PRNTlevel>=1 ) - mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword; -#endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - - - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. */ -#if ( PRNTlevel>=1 ) - ++nUblocks; -#endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - /* XSL 4-23-01 */ - if ( !(Unzval_br_ptr[lb] = doublecomplexCalloc_dist(len)) ) - ABORT("Calloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - } /* for lb ... */ - - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Ucbs); -#if ( PRNTlevel>=1 ) - mem_use -= 2*k * iword; -#endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); -#if ( PRNTlevel>=1 ) - mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*zword; -#endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. */ - if ( !(Lnzval_bc_ptr = - (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; -#if ( PRNTlevel>=1 ) - mem_use += 4*k*sizeof(int_t*) + 2*len*iword; -#endif - - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - if ( irow < fsupc ) continue; /* Skip U. XSL 4-23-01 */ - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } - - jbrow = PROW( jb, grid ); - - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ - - kseen = 0; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow ) - bsendx_plist[ljb][pr] = YES; - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - /* Initialize block length to 0. XSL 4-23-01 */ - index[Urb_indptr[lb]+1] = 0; - Urb_indptr[lb] += UB_DESCRIPTOR; - len = Urb_indptr[lb]; - for (k = 0; k < nsupc; ++k) - index[len+k] = FstBlockC( gb+1 ); - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - } /* if myrow == pr ... */ - } /* for i ... */ - } /* for j ... */ -#if 1 - /* XSL 4-23-01 */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - /* Gather the initial values of A directly into Uval. - (No SPA is involved.) */ - for (i = xa_begin[j]; i < xa_end[j]; ++i) { - irow = asub[i]; - if ( irow >= fsupc ) continue; /* Skip L */ - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - len = Urb_indptr[lb]; - jj = index[len]; /* First nonzero in segment */ - uval[Urb_length[lb] + irow - jj] = a[i]; - } - } - /* Now increment the index pointer for each row block */ - for (lb = 0; lb < nrbu; ++lb) { - if ( rb_marker[lb] == jb+1 ) { /* Not an empty block */ - gb = lb*grid->nprow + myrow; /* Global block # */ - index = Ufstnz_br_ptr[lb]; - jj = index[Urb_indptr[lb]]; - k = FstBlockC( gb+1 ) - jj; - Urb_length[lb] += k; - /* Increment the block length */ - index[Urb_indptr[lb]+fsupc-j-1] += k; - Urb_indptr[lb] += 1; - } - } - } /* for j = fsupc ... */ -#else - /* Figure out how many nonzeros in each block, and gather - the initial values of A from SPA into Uval. */ - for (lb = 0; lb < nrbu; ++lb) { - if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - len = Urb_indptr[lb]; - gb = lb * grid->nprow + myrow;/* Global block number */ - k = FstBlockC( gb+1 ); - irow = ilsum[lb] - FstBlockC( gb ); - for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) { - j = index[len+jj]; /* First nonzero in segment. */ - bnnz += k - j; - for (i = j; i < k; ++i) { - uval[Urb_length[lb]++] = dense_col[irow + i]; - dense_col[irow + i] = zero; - } - dense_col += ldaspa; - } - index[len-1] = bnnz; /* Set block length in Descriptor */ - Urb_indptr[lb] += nsupc; - } - } /* for lb ... */ -#endif - - /*------------------------------------------------ - * SET UP L BLOCKS. - *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow ) - fsendx_plist[ljb][pr] = YES; - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } -#if ( PRNTlevel>=1 ) - ++nLblocks; -#endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if ( !(Lnzval_bc_ptr[ljb] = - doublecomplexMalloc_dist(len*nsupc)) ) { - fprintf(stderr, "col block %d ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - /* Propagate the compressed row subscripts to Lindex[], and - the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - } else { - Lrowind_bc_ptr[ljb] = NULL; - Lnzval_bc_ptr[ljb] = NULL; - } /* if nrbl ... */ - - } /* if mycol == pc */ - - } /* for jb ... */ - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - -#if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", - nLblocks, nUblocks); -#endif - - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - - } /* if fact == SamePattern_SameRowPerm */ - -#if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit zdistribute()"); -#endif - - return (mem_use); -} /* ZDISTRIBUTE */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zgsequ.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,178 +0,0 @@ - - -/* - * File name: zgsequ.c - * History: Modified from LAPACK routine ZGEEQU - */ -#include -#include "superlu_zdefs.h" - -void -zgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info) -{ -/* - Purpose - ======= - - ZGSEQU_DIST computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= M: the i-th row of A is exactly zero - > M: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double rcmin, rcmax; - double bignum, smlnum; - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("zgsequ_dist", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = (NCformat *) A->Store; - Aval = (doublecomplex *) Astore->nzval; - - /* Get machine constants. */ - smlnum = dlamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* zgsequ_dist */ - - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlangs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - - -/* - * File name: zlangs.c - * History: Modified from lapack routine ZLANGE - */ -#include -#include "superlu_zdefs.h" - -double zlangs_dist(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - ZLANGS_DIST returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - ZLANGE returns the value - - ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in ZLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double value=0., sum; - double *rwork; - - Astore = A->Store; - Aval = Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, z_abs( &Aval[i]) ); - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += z_abs( &Aval[i] ); - value = SUPERLU_MAX(value,sum); - } - - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += z_abs( &Aval[i] ); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* zlangs_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlaqgs.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,132 +0,0 @@ - - -/* - * File name: zlaqgs.c - * History: Modified from LAPACK routine ZLAQGE - */ -#include -#include "superlu_zdefs.h" - -void -zlaqgs_dist(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - ZLAQGS_DIST equilibrates a general sparse M by N matrix A using the row - and column scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double large, small, cj; - extern double dlamch_(char *); - double temp; - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = (NCformat *) A->Store; - Aval = (doublecomplex *) Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = dlamch_("Safe minimum") / dlamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - zd_mult(&Aval[i], &Aval[i], cj); - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zd_mult(&Aval[i], &Aval[i], r[irow]); - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp = cj * r[irow]; - zd_mult(&Aval[i], &Aval[i], temp); - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* zlaqgs_dist */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zldperm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zldperm.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zldperm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zldperm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,159 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_zdefs.h" - -extern void mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [], - int_t*, int_t [], int_t*, int_t[], int_t*, double [], - int_t [], int_t []); - -void -zldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], - doublecomplex nzval[], int_t *perm, double u[], double v[]) -{ -/* - * Purpose - * ======= - * - * ZLDPERM finds a row permutation so that the matrix has large - * entries on the diagonal. - * - * Arguments - * ========= - * - * job (input) int - * Control the action. Possible values for JOB are: - * = 1 : Compute a row permutation of the matrix so that the - * permuted matrix has as many entries on its diagonal as - * possible. The values on the diagonal are of arbitrary size. - * HSL subroutine MC21A/AD is used for this. - * = 2 : Compute a row permutation of the matrix so that the smallest - * value on the diagonal of the permuted matrix is maximized. - * = 3 : Compute a row permutation of the matrix so that the smallest - * value on the diagonal of the permuted matrix is maximized. - * The algorithm differs from the one used for JOB = 2 and may - * have quite a different performance. - * = 4 : Compute a row permutation of the matrix so that the sum - * of the diagonal entries of the permuted matrix is maximized. - * = 5 : Compute a row permutation of the matrix so that the product - * of the diagonal entries of the permuted matrix is maximized - * and vectors to scale the matrix so that the nonzero diagonal - * entries of the permuted matrix are one in absolute value and - * all the off-diagonal entries are less than or equal to one in - * absolute value. - * Restriction: 1 <= JOB <= 5. - * - * n (input) int - * The order of the matrix. - * - * nnz (input) int - * The number of nonzeros in the matrix. - * - * adjncy (input) int*, of size nnz - * The adjacency structure of the matrix, which contains the row - * indices of the nonzeros. - * - * colptr (input) int*, of size n+1 - * The pointers to the beginning of each column in ADJNCY. - * - * nzval (input) doublecomplex*, of size nnz - * The nonzero values of the matrix. nzval[k] is the value of - * the entry corresponding to adjncy[k]. - * It is not used if job = 1. - * - * perm (output) int*, of size n - * The permutation vector. perm[i] = j means row i in the - * original matrix is in row j of the permuted matrix. - * - * u (output) double*, of size n - * If job = 5, the natural logarithms of the row scaling factors. - * - * v (output) double*, of size n - * If job = 5, the natural logarithms of the column scaling factors. - * The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j). - */ - - int_t i, liw, ldw, num; - int_t *iw, icntl[10], info[10]; - double *dw; - double *nzval_abs = doubleMalloc_dist(nnz); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Enter zldperm()"); -#endif - liw = 5*n; - if ( job == 3 ) liw = 10*n + nnz; - if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); - ldw = 3*n + nnz; - if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); - - /* Increment one to get 1-based indexing. */ - for (i = 0; i <= n; ++i) ++colptr[i]; - for (i = 0; i < nnz; ++i) ++adjncy[i]; -#if ( DEBUGlevel>=2 ) - printf("LDPERM(): n %d, nnz %d\n", n, nnz); - PrintInt10("colptr", n+1, colptr); - PrintInt10("adjncy", nnz, adjncy); -#endif - - /* - * NOTE: - * ===== - * - * MC64AD assumes that column permutation vector is defined as: - * perm(i) = j means column i of permuted A is in column j of original A. - * - * Since a symmetric permutation preserves the diagonal entries. Then - * by the following relation: - * P'(A*P')P = P'A - * we can apply inverse(perm) to rows of A to get large diagonal entries. - * But, since 'perm' defined in MC64AD happens to be the reverse of - * SuperLU's definition of permutation vector, therefore, it is already - * an inverse for our purpose. We will thus use it directly. - * - */ - mc64id_(icntl); -#if 0 - /* Suppress error and warning messages. */ - icntl[0] = -1; - icntl[1] = -1; -#endif - - for (i = 0; i < nnz; ++i) nzval_abs[i] = z_abs1(&nzval[i]); - mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval_abs, &num, perm, - &liw, iw, &ldw, dw, icntl, info); - -#if ( DEBUGlevel>=2 ) - PrintInt10("perm", n, perm); - printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); -#endif - if ( info[0] == 1 ) { /* Structurally singular */ - printf(".. The last %d permutations:\n", n-num); - PrintInt10("perm", n-num, &perm[num]); - } - - /* Restore to 0-based indexing. */ - for (i = 0; i <= n; ++i) --colptr[i]; - for (i = 0; i < nnz; ++i) --adjncy[i]; - for (i = 0; i < n; ++i) --perm[i]; - - if ( job == 5 ) - for (i = 0; i < n; ++i) { - u[i] = dw[i]; - v[i] = dw[n+i]; - } - - SUPERLU_FREE(iw); - SUPERLU_FREE(dw); - SUPERLU_FREE(nzval_abs); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(0, "Exit zldperm()"); -#endif -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmemory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmemory.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,118 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -#include "superlu_zdefs.h" - - -/* Variables external to this file */ -extern LU_stack_t stack; - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int_t zQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, - mem_usage_t *mem_usage) -{ - register int_t dword, gb, iword, k, maxsup, nb, nsupers; - int_t *index, *xsup; - int iam, mycol, myrow; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - iword = sizeof(int_t); - dword = sizeof(doublecomplex); - maxsup = sp_ienv_dist(3); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - mem_usage->for_lu = 0; - - /* For L factor */ - nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->npcol + mycol; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Lrowind_bc_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float) - ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); - mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); - } - } - } - - /* For U factor */ - nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ - for (k = 0; k < nb; ++k) { - gb = k * grid->nprow + myrow; /* Global block number. */ - if ( gb < nsupers ) { - index = Llu->Ufstnz_br_ptr[k]; - if ( index ) { - mem_usage->for_lu += (float)(index[2] * iword); - mem_usage->for_lu += (float)(index[1] * dword); - } - } - } - - /* Working storage to support factorization */ - mem_usage->total = mem_usage->for_lu; - mem_usage->total += - (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + - ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); - /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ - mem_usage->total += - (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); - mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; - k = CEILING( nsupers, grid->nprow ); - mem_usage->total += (float)(2 * k * iword); - - return 0; -} /* zQuerySpace_dist */ - - -/* - * Allocate storage for original matrix A - */ -void -zallocateA_dist(int_t n, int_t nnz, doublecomplex **a, int_t **asub, int_t **xa) -{ - *a = (doublecomplex *) doublecomplexMalloc_dist(nnz); - *asub = (int_t *) intMalloc_dist(nnz); - *xa = (int_t *) intMalloc_dist(n+1); -} - - -doublecomplex *doublecomplexMalloc_dist(int_t n) -{ - doublecomplex *buf; - buf = (doublecomplex *) - SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex)); - return (buf); -} - -doublecomplex *doublecomplexCalloc_dist(int_t n) -{ - doublecomplex *buf; - register int_t i; - doublecomplex zero = {0.0, 0.0}; - buf = (doublecomplex *) - SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex)); - if ( !buf ) return (buf); - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmyblas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmyblas2.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,184 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: zmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ -#include "dcomplex.h" - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs ) -{ - int k; - doublecomplex x0, x1, x2, x3, temp; - doublecomplex *M0; - doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - - M0 = &M[0]; - - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x1, &rhs[firstcol+1], &temp); - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x2, &rhs[firstcol+2], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&x2, &x2, &temp); - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x3, &rhs[firstcol+3], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&x3, &x3, &temp); - zz_mult(&temp, &x2, Mki2); Mki2++; - z_sub(&x3, &x3, &temp); - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x2, Mki2); Mki2++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x3, Mki3); Mki3++; - z_sub(&rhs[k], &rhs[k], &temp); - } - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x1, &rhs[firstcol+1], &temp); - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&rhs[k], &rhs[k], &temp); - } - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -zusolve ( - int ldm, /* in */ - int ncol, /* in */ - doublecomplex *M, /* in */ - doublecomplex *rhs /* modified */ -) -{ - doublecomplex xj, temp; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) { - zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */ - z_sub(&rhs[irow], &rhs[irow], &temp); - } - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void zmatvec ( - int ldm, /* in -- leading dimension of M */ - int nrow, /* in */ - int ncol, /* in */ - doublecomplex *M, /* in */ - doublecomplex *vec, /* in */ - doublecomplex *Mxvec /* in/out */ -) -{ - doublecomplex vi0, vi1, vi2, vi3; - doublecomplex *M0, temp; - doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - int k; - - M0 = &M[0]; - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - zz_mult(&temp, &vi0, Mki0); Mki0++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi1, Mki1); Mki1++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi2, Mki2); Mki2++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi3, Mki3); Mki3++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - } - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - zz_mult(&temp, &vi0, Mki0); Mki0++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - } - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zreadhb.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zreadhb.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zreadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zreadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,264 +0,0 @@ -#include -#include -#include "dcomplex.h" -#include "superlu_zdefs.h" - -/* - * Prototypes - */ -static void ReadVector(FILE *, int_t, int_t *, int_t, int_t); -static void zReadValues(FILE *, int_t, doublecomplex *, int_t, int_t); -static int zDumpLine(FILE *); -static int ParseIntFormat(char *, int_t *, int_t *); -static int ParseFloatFormat(char *, int_t *, int_t *); - - -void -zreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, - doublecomplex **nzval, int_t **rowind, int_t **colptr) -{ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - * - * Purpose - * ======= - * - * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format - * as described below. - * - * Line 1 (A72,A8) - * Col. 1 - 72 Title (TITLE) - * Col. 73 - 80 Key (KEY) - * - * Line 2 (5I14) - * Col. 1 - 14 Total number of lines excluding header (TOTCRD) - * Col. 15 - 28 Number of lines for pointers (PTRCRD) - * Col. 29 - 42 Number of lines for row (or variable) indices (INDCRD) - * Col. 43 - 56 Number of lines for numerical values (VALCRD) - * Col. 57 - 70 Number of lines for right-hand sides (RHSCRD) - * (including starting guesses and solution vectors - * if present) - * (zero indicates no right-hand side data is present) - * - * Line 3 (A3, 11X, 4I14) - * Col. 1 - 3 Matrix type (see below) (MXTYPE) - * Col. 15 - 28 Number of rows (or variables) (NROW) - * Col. 29 - 42 Number of columns (or elements) (NCOL) - * Col. 43 - 56 Number of row (or variable) indices (NNZERO) - * (equal to number of entries for assembled matrices) - * Col. 57 - 70 Number of elemental matrix entries (NELTVL) - * (zero in the case of assembled matrices) - * Line 4 (2A16, 2A20) - * Col. 1 - 16 Format for pointers (PTRFMT) - * Col. 17 - 32 Format for row (or variable) indices (INDFMT) - * Col. 33 - 52 Format for numerical values of coefficient matrix (VALFMT) - * Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) - * - * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present - * Col. 1 Right-hand side type: - * F for full storage or M for same format as matrix - * Col. 2 G if a starting vector(s) (Guess) is supplied. (RHSTYP) - * Col. 3 X if an exact solution vector(s) is supplied. - * Col. 15 - 28 Number of right-hand sides (NRHS) - * Col. 29 - 42 Number of row indices (NRHSIX) - * (ignored in case of unassembled matrices) - * - * The three character type field on line 3 describes the matrix type. - * The following table lists the permitted values for each of the three - * characters. As an example of the type field, RSA denotes that the matrix - * is real, symmetric, and assembled. - * - * First Character: - * R Real matrix - * C Complex matrix - * P Pattern only (no numerical values supplied) - * - * Second Character: - * S Symmetric - * U Unsymmetric - * H Hermitian - * Z Skew symmetric - * R Rectangular - * - * Third Character: - * A Assembled - * E Elemental matrices (unassembled) - * - */ - - register int_t i, numer_lines, rhscrd = 0; - int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; - char buf[100], type[4]; - - /* Line 1 */ - fgets(buf, 100, fp); -/* fscanf(fp, "%72c", buf); buf[72] = 0; - if ( !iam ) printf("Title: %s", buf); - fscanf(fp, "%8c", key); key[8] = 0; - if ( !iam ) printf("Key: %s\n", key); - zDumpLine(fp);*/ - - /* Line 2 */ - for (i=0; i<5; i++) { - fscanf(fp, "%14c", buf); buf[14] = 0; - tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ - if (i == 3) numer_lines = tmp; - if (i == 4 && tmp) rhscrd = tmp; - } - zDumpLine(fp); - - /* Line 3 */ - fscanf(fp, "%3c", type); - fscanf(fp, "%11c", buf); /* pad */ - type[3] = 0; -#if ( DEBUGlevel>=1 ) - if ( !iam ) printf("Matrix type %s\n", type); -#endif - - fscanf(fp, "%14c", buf); *nrow = atoi(buf); - fscanf(fp, "%14c", buf); *ncol = atoi(buf); - fscanf(fp, "%14c", buf); *nonz = atoi(buf); - fscanf(fp, "%14c", buf); tmp = atoi(buf); - - if (tmp != 0) - if ( !iam ) printf("This is not an assembled matrix!\n"); - if (*nrow != *ncol) - if ( !iam ) printf("Matrix is not square.\n"); - zDumpLine(fp); - - /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ - zallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); - - /* Line 4: format statement */ - fscanf(fp, "%16c", buf); - ParseIntFormat(buf, &colnum, &colsize); - fscanf(fp, "%16c", buf); - ParseIntFormat(buf, &rownum, &rowsize); - fscanf(fp, "%20c", buf); - ParseFloatFormat(buf, &valnum, &valsize); - fscanf(fp, "%20c", buf); - zDumpLine(fp); - - /* Line 5: right-hand side */ - if ( rhscrd ) zDumpLine(fp); /* skip RHSFMT */ - -#if ( DEBUGlevel>=1 ) - if ( !iam ) { - printf("%d rows, %d nonzeros\n", *nrow, *nonz); - printf("colnum %d, colsize %d\n", colnum, colsize); - printf("rownum %d, rowsize %d\n", rownum, rowsize); - printf("valnum %d, valsize %d\n", valnum, valsize); - } -#endif - - ReadVector(fp, *ncol+1, *colptr, colnum, colsize); - ReadVector(fp, *nonz, *rowind, rownum, rowsize); - if ( numer_lines ) { - zReadValues(fp, *nonz, *nzval, valnum, valsize); - } - - fclose(fp); - -} - -/* Eat up the rest of the current line */ -static int zDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -static int ParseIntFormat(char *buf, int_t *num, int_t *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - *size = atoi(tmp); - return 0; -} - -static int ParseFloatFormat(char *buf, int_t *num, int_t *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); - return 0; -} - -static void -ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) -{ - register int_t i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jnrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - xerbla_("sp_ztrsv", &i); - return 0; - } - - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( !(work = doublecomplexCalloc_dist(L->nrow)) ) - ABORT("Malloc fails for work in sp_ztrsv()."); - - if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - solve_ops += 4 * nsupc * (nsupc - 1); - solve_ops += 8 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); - - zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1); -#endif -#else - zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ - work[i] = comp_zero; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 4 * nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[fsupc], &Uval[i]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[jcol], &Uval[i]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } - } - } /* for k ... */ - - } - } else { /* Form x := inv(A')*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 8 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - zz_mult(&comp_zero, &x[irow], &Lval[i]); - z_sub(&x[jcol], &x[jcol], &comp_zero); - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += 4 * nsupc * (nsupc - 1); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[irow], &Uval[i]); - z_sub(&x[jcol], &x[jcol], &comp_zero); - } - } - - solve_ops += 4 * nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx, 1, 1, 1); -#endif -#else - ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - /*SuperLUStat.ops[SOLVE] += solve_ops;*/ - SUPERLU_FREE(work); - return 0; -} - - - -int -sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, - doublecomplex *x, int incx, doublecomplex beta, - doublecomplex *y, int incy) -{ -/* Purpose - ======= - - sp_zgemv() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) doublecomplex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - - X - (input) doublecomplex*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) doublecomplex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) doublecomplex*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int info; - doublecomplex temp, temp1; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - doublecomplex comp_zero = {0.0, 0.0}; - doublecomplex comp_one = {1.0, 0.0}; - - notran = lsame_(trans, "N"); - Astore = A->Store; - Aval = Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - xerbla_("sp_zgemv ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || - z_eq(&alpha, &comp_zero) && - z_eq(&beta, &comp_one)) - return 0; - - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (lsame_(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if ( !z_eq(&beta, &comp_one) ) { - if (incy == 1) { - if ( z_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) y[i] = comp_zero; - else - for (i = 0; i < leny; ++i) - zz_mult(&y[i], &beta, &y[i]); - } else { - iy = ky; - if ( z_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) { - y[iy] = comp_zero; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - zz_mult(&y[iy], &beta, &y[iy]); - iy += incy; - } - } - } - - if ( z_eq(&alpha, &comp_zero) ) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if ( !z_eq(&x[jx], &comp_zero) ) { - zz_mult(&temp, &alpha, &x[jx]); - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zz_mult(&temp1, &temp, &Aval[i]); - z_add(&y[irow], &y[irow], &temp1); - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = comp_zero; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zz_mult(&temp1, &Aval[i], &x[irow]); - z_add(&temp, &temp, &temp1); - } - zz_mult(&temp1, &alpha, &temp); - z_add(&y[jy], &y[jy], &temp1); - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_zgemv */ - diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ -/* - * -- Distributed SuperLU routine (version 1.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * September 1, 1999 - * - */ - -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "superlu_zdefs.h" - - -int -sp_zgemm_dist(char *transa, char *transb, int m, int n, int k, - doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, - doublecomplex beta, doublecomplex *c, int ldc) -{ -/* Purpose - ======= - - sp_z performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) doublecomplex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = Z; Mtype = GE. - In the future, more general A can be handled. - - B - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) doublecomplex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_zgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zutil.c --- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,492 +0,0 @@ - -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - */ - -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "superlu_zdefs.h" - -void -zCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, - doublecomplex *nzval, int_t *rowind, int_t *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NCformat *) A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -zCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n, - int_t nnz_loc, int_t m_loc, int_t fst_row, - doublecomplex *nzval, int_t *colind, int_t *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat_loc *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NRformat_loc *) A->Store; - Astore->nnz_loc = nnz_loc; - Astore->fst_row = fst_row; - Astore->m_loc = m_loc; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -zCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, - doublecomplex *a, int_t *colind, int_t *rowptr, - doublecomplex **at, int_t **rowind, int_t **colptr) -{ - register int i, j, col, relpos; - int_t *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (doublecomplex *) doublecomplexMalloc_dist(nnz); - *rowind = intMalloc_dist(nnz); - *colptr = intMalloc_dist(n+1); - marker = intCalloc_dist(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - -/* Copy matrix A into matrix B. */ -void -zCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void zPrint_CompCol_Matrix_dist(SuperMatrix *A) -{ - NCformat *Astore; - register int i; - doublecomplex *dp; - - printf("\nCompCol matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NCformat *) A->Store; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - if ( (dp = (doublecomplex *) Astore->nzval) != NULL ) { - printf("nzval:\n"); - for (i = 0; i < Astore->nnz; ++i) printf("%f ", dp[i]); - } - printf("\nrowind:\n"); - for (i = 0; i < Astore->nnz; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr:\n"); - for (i = 0; i <= A->ncol; ++i) printf("%d ", Astore->colptr[i]); - printf("\nend CompCol matrix.\n"); -} - -void zPrint_Dense_Matrix_dist(SuperMatrix *A) -{ - DNformat *Astore; - register int i; - doublecomplex *dp; - - printf("\nDense matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - dp = (doublecomplex *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda); - printf("\nnzval: "); - for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i]); - printf("\nend Dense matrix.\n"); -} - -int zPrint_CompRowLoc_Matrix_dist(SuperMatrix *A) -{ - NRformat_loc *Astore; - int_t i, nnz_loc, m_loc; - doublecomplex *dp; - - printf("\n==== CompRowLoc matrix: "); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NRformat_loc *) A->Store; - printf("nrow %d, ncol %d\n", A->nrow,A->ncol); - nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; - printf("nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc, - Astore->fst_row); - PrintInt10("rowptr", m_loc+1, Astore->rowptr); - PrintInt10("colind", nnz_loc, Astore->colind); - if ( (dp = (doublecomplex *) Astore->nzval) != NULL ) - PrintDoublecomplex("nzval", nnz_loc, dp); - printf("==== end CompRowLoc matrix\n"); -} - -int file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A) -{ - NRformat_loc *Astore; - int_t i, nnz_loc, m_loc; - doublecomplex *dp; - - fprintf(fp, "\n==== CompRowLoc matrix: "); - fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (NRformat_loc *) A->Store; - fprintf(fp, "nrow %d, ncol %d\n", A->nrow, A->ncol); - nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; - fprintf(fp, "nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc, - Astore->fst_row); - file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr); - file_PrintInt10(fp, "colind", nnz_loc, Astore->colind); - if ( (dp = (doublecomplex *) Astore->nzval) != NULL ) - file_PrintDoublecomplex(fp, "nzval", nnz_loc, dp); - fprintf(fp, "==== end CompRowLoc matrix\n"); -} - -void -zCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, doublecomplex *x, - int_t ldx, Stype_t stype, Dtype_t dtype, - Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (doublecomplex *) x; -} - -void -zCopy_Dense_Matrix_dist(int_t M, int_t N, doublecomplex *X, int_t ldx, - doublecomplex *Y, int_t ldy) -{ -/* - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -zCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, - doublecomplex *nzval, int_t *nzval_colptr, - int_t *rowind, int_t *rowind_colptr, - int_t *col_to_sup, int_t *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - -void -zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - if ( i % 2 ) x[i + j*ldx].r = 1.0; - else x[i + j*ldx].r = 2.0; - x[i + j*ldx].i = 0.0; - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -zFillRHS_dist(char *trans, int_t nrhs, doublecomplex *x, int_t ldx, - SuperMatrix *A, doublecomplex *rhs, int_t ldb) -{ - doublecomplex one = {1.0, 0.0}; - doublecomplex zero = {0.0, 0.0}; - - sp_zgemm_dist(trans, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldb); - -} - -/* - * Fills a doublecomplex precision array with a given value. - */ -void -zfill_dist(doublecomplex *a, int_t alen, doublecomplex dval) -{ - register int_t i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void zinf_norm_error_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx, - doublecomplex *xtrue, int_t ldxtrue, - gridinfo_t *grid) -{ - double err, xnorm; - doublecomplex *x_work, *xtrue_work; - doublecomplex temp; - int i, j; - - for (j = 0; j < nrhs; j++) { - x_work = &x[j*ldx]; - xtrue_work = &xtrue[j*ldxtrue]; - err = xnorm = 0.0; - for (i = 0; i < n; i++) { - z_sub(&temp, &x_work[i], &xtrue_work[i]); - err = SUPERLU_MAX(err, z_abs(&temp)); - xnorm = SUPERLU_MAX(xnorm, z_abs(&x_work[i])); - } - err = err / xnorm; - printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err); - } -} - -void PrintDoublecomplex(char *name, int_t len, doublecomplex *x) -{ - register int_t i; - - printf("%10s:\tReal\tImag\n", name); - for (i = 0; i < len; ++i) - printf("\t%d\t%.4f\t%.4f\n", i, x[i].r, x[i].i); -} - -int file_PrintDoublecomplex(FILE *fp, char *name, int_t len, doublecomplex *x) -{ - register int_t i; - - fprintf(fp, "%10s:\tReal\tImag\n", name); - for (i = 0; i < len; ++i) - fprintf(fp, "\t%d\t%.4f\t%.4f\n", i, x[i].r, x[i].i); -} - -/* - * Print the blocks in the factored matrix L. - */ -void zPrintLblocks(int_t iam, int_t nsupers, gridinfo_t *grid, - Glu_persist_t *Glu_persist, LocalLU_t *Llu) -{ - register int_t c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb; - register int_t k, mycol, r; - int_t *xsup = Glu_persist->xsup; - int_t *index; - doublecomplex *nzval; - - printf("\n(%d) L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam); - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - printf("(%d) block column %d (local), # row blocks %d\n", - iam, lb, nb); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - printf("(%d) row-block %d: block # %d\tlength %d\n", - iam, c, index[k], len); - PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]); - for (j = 0; j < nsupc; ++j) { - PrintDoublecomplex("nzval", len, &nzval[r + j*nsupr]); - } - k += LB_DESCRIPTOR + len; - r += len; - } - } - printf("(%d)", iam); - PrintInt10("ToSendR[]", grid->npcol, Llu->ToSendR[lb]); - PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); - } - printf("nfrecvx %4d\n", Llu->nfrecvx); - k = CEILING( nsupers, grid->nprow ); - PrintInt10("fmod", k, Llu->fmod); - -} /* ZPRINTLBLOCKS */ - - -/* - * Print the blocks in the factored matrix U. - */ -void zPrintUblocks(int_t iam, int_t nsupers, gridinfo_t *grid, - Glu_persist_t *Glu_persist, LocalLU_t *Llu) -{ - register int_t c, extra, jb, k, lb, len, nb, nrb, nsupc; - register int_t myrow, r; - int_t *xsup = Glu_persist->xsup; - int_t *index; - doublecomplex *nzval; - - printf("\n(%d) U BLOCKS IN ROW-MAJOR ORDER -->\n", iam); - nrb = nsupers / grid->nprow; - extra = nsupers % grid->nprow; - myrow = MYROW( iam, grid ); - if ( myrow < extra ) ++nrb; - for (lb = 0; lb < nrb; ++lb) { - index = Llu->Ufstnz_br_ptr[lb]; - if ( index ) { /* Not an empty row */ - nzval = Llu->Unzval_br_ptr[lb]; - nb = index[0]; - printf("(%d) block row %d (local), # column blocks %d\n", - iam, lb, nb); - r = 0; - for (c = 0, k = BR_HEADER; c < nb; ++c) { - jb = index[k]; - len = index[k+1]; - printf("(%d) col-block %d: block # %d\tlength %d\n", - iam, c, jb, index[k+1]); - nsupc = SuperSize( jb ); - PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]); - PrintDoublecomplex("nzval", len, &nzval[r]); - k += UB_DESCRIPTOR + nsupc; - r += len; - } - - printf("(%d) ToSendD[] %d\n", iam, Llu->ToSendD[lb]); - } - } -} /* ZPRINTUBLOCKS */ - -int -zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm, - gridinfo_t *grid) -{ - int_t procs = grid->nprow*grid->npcol; - fprintf(fp, "TotalIndSend %d\tTotalValSend %d\n", gsmv_comm->TotalIndSend, - gsmv_comm->TotalValSend); - file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start); - file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend); - file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv); - file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend); - file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv); - file_PrintInt10(fp, "SendCounts", procs, gsmv_comm->SendCounts); - file_PrintInt10(fp, "RecvCounts", procs, gsmv_comm->RecvCounts); -} - - -/* cg5.cua - b = A*x y = L\b - 0 1 + 4.0000i 1.0000 + 4.0000i - 1 0 + 5.0000i 1.3529 + 5.4118i - 2 1 + 4.0000i 1.0000 + 4.0000i - 3 2 + 3.0000i 2.0000 + 3.0000i - 4 1 + 4.0000i 3.5882 + 4.3529i - 5 1 + 4.0000i 4.1250 + 3.3202i - 6 + 5.0000i 4.4640 + 3.8632i - 7 2 + 3.0000i 2.0000 + 3.0000i - 8 2 + 3.0000i 2.0000 + 3.0000i - 9 1 + 4.0000i 1.0000 + 4.0000i - 10 1 + 4.0000i 3.5882 + 4.3529i - 11 + 5.0000i 0 + 5.0000i - 12 1 + 4.0000i 5.1793 + 4.6604i - 13 2 + 3.0000i 2.0000 + 3.0000i - 14 1 + 4.0000i 1.0000 + 4.0000i - 15 + 5.0000i 1.3529 + 5.4118i - 16 1 + 4.0000i 4.0045 + 3.8950i - 17 + 5.0000i 3.0338 + 4.6248i - 18 1 + 4.0000i 5.4495 + 2.2703i - 19 + 5.0000i 4.0980 + 3.7290i - 20 + 5.0000i 4.2680 + 3.7739i - 21 + 5.0000i 5.3514 + 2.9480i - 22 1 + 4.0000i 4.4178 + 2.0476i - 23 1 + 4.0000i 3.5615 + 2.8322i - 24 + 5.0000i 4.7526 + 2.2605i -*/ diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx --- hypre-2.11.2/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx 2017-10-20 17:42:22.000000000 +0000 @@ -21,15 +21,6 @@ #include #include -#if HAVE_SUPERLU_20 -#include "dsp_defs.h" -#include "superlu_util.h" -#endif -#if HAVE_SUPERLU -#include "slu_ddefs.h" -#include "slu_util.h" -#endif - /*------------------------------------------------------------------------- MPI definitions -------------------------------------------------------------------------*/ @@ -44,6 +35,15 @@ #include "FEI_HYPRE_Impl.h" +#ifdef HAVE_SUPERLU_20 +#include "dsp_defs.h" +#include "superlu_util.h" +#endif +#ifdef HAVE_SUPERLU +#include "slu_ddefs.h" +#include "slu_util.h" +#endif + extern "C" { int HYPRE_LSI_Search(int *, int, int); @@ -517,7 +517,7 @@ { int i, olevel; char param[256], param1[256]; -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU int nprocs; #endif @@ -566,7 +566,7 @@ else if ( ! strcmp(param, "gmres") ) solverID_ = 1; else if ( ! strcmp(param, "cgs") ) solverID_ = 2; else if ( ! strcmp(param, "bicgstab")) solverID_ = 3; -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU else if ( ! strcmp(param, "superlu") ) { MPI_Comm_size( mpiComm_, &nprocs ); @@ -3338,13 +3338,14 @@ -------------------------------------------------------------------------*/ int FEI_HYPRE_Impl::solveUsingSuperLU() { -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU int localNRows, localNnz, *countArray, irow, jcol, *cscIA, *cscJA; int colNum, index, *etree, permcSpec, lwork, panelSize, relax, info; int *permC, *permR; - double *cscAA, diagPivotThresh, dropTol, *rVec, rnorm; + double *cscAA, diagPivotThresh, *rVec, rnorm; superlu_options_t slu_options; SuperLUStat_t slu_stat; + GlobalLU_t Glu; trans_t trans; SuperMatrix superLU_Amat; SuperMatrix superLU_Lmat; @@ -3408,7 +3409,6 @@ slu_options.SymmetricMode = NO; sp_preorder(&slu_options, &superLU_Amat, permC, etree, &AC); diagPivotThresh = 1.0; - dropTol = 0.0; panelSize = sp_ienv(1); relax = sp_ienv(2); StatInit(&slu_stat); @@ -3417,9 +3417,12 @@ slu_options.Fact = DOFACT; slu_options.DiagPivotThresh = diagPivotThresh; - dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// etree, NULL, lwork, permC, permR, &superLU_Lmat, +// &superLU_Umat, &slu_stat, &info); + dgstrf(&slu_options, &AC, relax, panelSize, etree, NULL, lwork, permC, permR, &superLU_Lmat, - &superLU_Umat, &slu_stat, &info); + &superLU_Umat, &Glu, &slu_stat, &info); Destroy_CompCol_Permuted(&AC); Destroy_CompCol_Matrix(&superLU_Amat); diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx --- hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx 2017-10-20 17:42:22.000000000 +0000 @@ -1376,8 +1376,8 @@ if ( HYPreconID_ == HYMLI ) HYPRE_LSI_MLISetParams(HYPrecon_, params[i]); #else - if ( (HYOutputLevel_ & HYFEI_SPECIALMASK) >= 2 && mypid_ == 0 ) - printf(" HYPRE_LSC::MLI SetParams - MLI unavailable.\n"); +// if ( (HYOutputLevel_ & HYFEI_SPECIALMASK) >= 2 && mypid_ == 0 ) +// printf(" HYPRE_LSC::MLI SetParams - MLI unavailable.\n"); #endif } @@ -4615,11 +4615,12 @@ double HYPRE_LinSysCore::solveUsingSuperLU(int& status) { + double rnorm=-1.0; #ifdef HAVE_SUPERLU int i, nnz, nrows, ierr; int rowSize, *colInd, *new_ia, *new_ja, *ind_array; int nz_ptr, *partition, start_row, end_row; - double *colVal, *new_a, rnorm=-1.0; + double *colVal, *new_a; HYPRE_ParCSRMatrix A_csr; HYPRE_ParVector r_csr; HYPRE_ParVector b_csr; @@ -4793,12 +4794,13 @@ double HYPRE_LinSysCore::solveUsingSuperLUX(int& status) { + double rnorm=-1.0; #ifdef HAVE_SUPERLU int i, nnz, nrows, ierr; int rowSize, *colInd, *new_ia, *new_ja, *ind_array; int nz_ptr; int *partition, start_row, end_row; - double *colVal, *new_a, rnorm=-1.0; + double *colVal, *new_a; HYPRE_ParCSRMatrix A_csr; HYPRE_ParVector r_csr; HYPRE_ParVector b_csr; @@ -4812,6 +4814,7 @@ double rpg, rcond; void *work=NULL; char equed[1]; + GlobalLU_t Glu; mem_usage_t mem_usage; superlu_options_t slu_options; SuperLUStat_t slu_stat; @@ -4901,7 +4904,7 @@ slu_options.Equil = YES; slu_options.Trans = NOTRANS; slu_options.Fact = DOFACT; - slu_options.IterRefine = DOUBLE; + slu_options.IterRefine = SLU_DOUBLE; slu_options.DiagPivotThresh = 1.0; slu_options.PivotGrowth = YES; slu_options.ConditionNumber = YES; @@ -4917,9 +4920,12 @@ // solve //------------------------------------------------------------------- +// dgssvx(&slu_options, &A2, perm_c, perm_r, etree, +// equed, R, C, &L, &U, work, lwork, &B, &X, +// &rpg, &rcond, ferr, berr, &mem_usage, &slu_stat, &info); dgssvx(&slu_options, &A2, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, - &rpg, &rcond, ferr, berr, &mem_usage, &slu_stat, &info); + &rpg, &rcond, ferr, berr, &Glu, &mem_usage, &slu_stat, &info); //------------------------------------------------------------------- // print SuperLU internal information at the first step diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c --- hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c 2017-10-20 17:42:22.000000000 +0000 @@ -33,13 +33,13 @@ #include "dsuperlu_include.h" #ifdef HAVE_DSUPERLU -#include "../DSuperLU/SRC/superlu_ddefs.h" +#include "superlu_ddefs.h" typedef struct HYPRE_LSI_DSuperLU_Struct { MPI_Comm comm_; HYPRE_ParCSRMatrix Amat_; - superlu_options_t options_; + superlu_dist_options_t options_; SuperMatrix sluAmat_; ScalePermstruct_t ScalePermstruct_; SuperLUStat_t stat_; @@ -186,8 +186,9 @@ if (sluPtr->outputLevel_ < 2) sluPtr->options_.PrintStat = NO; ScalePermstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_, &(sluPtr->ScalePermstruct_)); - LUstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_, - &(sluPtr->LUstruct_)); +// LUstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_, +// &(sluPtr->LUstruct_)); + LUstructInit(sluPtr->globalNRows_, &(sluPtr->LUstruct_)); sluPtr->berr_[0] = 0.0; PStatInit(&(sluPtr->stat_)); pdgssvx(&(sluPtr->options_), &(sluPtr->sluAmat_), diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx --- hypre-2.11.2/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx 2017-10-20 17:42:22.000000000 +0000 @@ -24,11 +24,11 @@ #include "HYPRE.h" #include "LLNL_FEI_Solver.h" -#if HAVE_SUPERLU_20 +#ifdef HAVE_SUPERLU_20 #include "dsp_defs.h" #include "superlu_util.h" #endif -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU #include "slu_ddefs.h" #include "slu_util.h" #endif @@ -77,7 +77,7 @@ int LLNL_FEI_Solver::parameters(int numParams, char **paramString) { int i, olevel; -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU int nprocs; #endif char param[256], param1[256]; @@ -124,7 +124,7 @@ else if ( !strcmp(param, "bicgstab")) solverID_ = 3; else if ( !strcmp(param, "superlu") ) { -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU MPI_Comm_size( mpiComm_, &nprocs ); if ( nprocs == 1 ) solverID_ = 4; else @@ -1101,15 +1101,16 @@ -------------------------------------------------------------------------*/ int LLNL_FEI_Solver::solveUsingSuperLU() { -#if HAVE_SUPERLU +#ifdef HAVE_SUPERLU int localNRows, localNnz, *countArray, irow, jcol, *cscIA, *cscJA; int colNum, index, *etree, permcSpec, lwork, panelSize, relax, info; int *permC, *permR, *diagIA, *diagJA; - double *cscAA, diagPivotThresh, dropTol, *rVec, rnorm; + double *cscAA, diagPivotThresh, *rVec, rnorm; double *diagAA; trans_t trans; superlu_options_t slu_options; SuperLUStat_t slu_stat; + GlobalLU_t Glu; SuperMatrix superLU_Amat; SuperMatrix superLU_Lmat; SuperMatrix superLU_Umat; @@ -1172,7 +1173,6 @@ slu_options.SymmetricMode = NO; sp_preorder(&slu_options, &superLU_Amat, permC, etree, &AC); diagPivotThresh = 1.0; - dropTol = 0.0; panelSize = sp_ienv(1); relax = sp_ienv(2); StatInit(&slu_stat); @@ -1180,9 +1180,12 @@ slu_options.ColPerm = MY_PERMC; slu_options.DiagPivotThresh = diagPivotThresh; - dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// etree, NULL, lwork, permC, permR, &superLU_Lmat, +// &superLU_Umat, &slu_stat, &info); + dgstrf(&slu_options, &AC, relax, panelSize, etree, NULL, lwork, permC, permR, &superLU_Lmat, - &superLU_Umat, &slu_stat, &info); + &superLU_Umat, &Glu, &slu_stat, &info); Destroy_CompCol_Permuted(&AC); Destroy_CompCol_Matrix(&superLU_Amat); diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/Makefile hypre-2.13.0/src/FEI_mv/fei-hypre/Makefile --- hypre-2.11.2/src/FEI_mv/fei-hypre/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/fei-hypre/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -10,18 +10,17 @@ # $Revision$ #EHEADER********************************************************************** - include ../../config/Makefile.config BASE_DIR = ${HYPRE_FEI_BASE_DIR} -SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU/SRC -SUPERLU_LIB = -L$(srcdir)/../SuperLU +#SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU/SRC +#SUPERLU_LIB = -L$(srcdir)/../SuperLU CINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include CXXINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include -CDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DHAVE_MLI -CXXDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DMPICH_SKIP_MPICXX -DHAVE_MLI +CDEFS = -DBOOL_NOT_SUPPORTED +CXXDEFS = ${CDEFS} -DMPICH_SKIP_MPICXX C_COMPILE_FLAGS = \ ${CDEFS}\ @@ -41,6 +40,7 @@ -I$(srcdir)/../../distributed_ls\ -I$(srcdir)/../fei-base\ ${SUPERLU_INCLUDE}\ + ${DSUPERLU_INCLUDE}\ ${CINCLUDES} CXX_COMPILE_FLAGS = \ @@ -61,6 +61,7 @@ -I$(srcdir)/../femli\ -I$(srcdir)/../fei-base\ ${SUPERLU_INCLUDE}\ + ${DSUPERLU_INCLUDE}\ ${CXXINCLUDES} HEADERS =\ diff -Nru hypre-2.11.2/src/FEI_mv/femli/Makefile hypre-2.13.0/src/FEI_mv/femli/Makefile --- hypre-2.11.2/src/FEI_mv/femli/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/femli/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,7 @@ -I.\ -I../..\ -I$(srcdir)\ + -I$(srcdir)/../../blas\ -I$(srcdir)/../../lapack\ -I$(srcdir)/../../utilities\ -I$(srcdir)/../../IJ_mv\ @@ -31,7 +32,7 @@ -I$(srcdir)/../../distributed_ls\ -I$(srcdir)/../../FEI_mv/fei-hypre\ -I$(srcdir)/../../FEI_mv/femli\ - -I$(srcdir)/../../FEI_mv/SuperLU/SRC + ${SUPERLU_INCLUDE} C_COMPILE_FLAGS =\ -DMLI_SUPERLU\ diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx hypre-2.13.0/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx --- hypre-2.11.2/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx 2017-10-20 17:42:22.000000000 +0000 @@ -110,12 +110,13 @@ int nrows, iP, startRow, nnz, *csrIA, *csrJA, *cscJA, *cscIA; int irow, icol, *rowArray, *countArray, colNum, index, nSubRows; int *etree, permcSpec, lwork, panelSize, relax, info, rowCnt; - double *csrAA, *cscAA, dropTol; + double *csrAA, *cscAA; hypre_ParCSRMatrix *hypreA; hypre_CSRMatrix *ADiag; SuperMatrix AC, superLU_Amat; superlu_options_t slu_options; SuperLUStat_t slu_stat; + GlobalLU_t Glu; /* --------------------------------------------------------------- * fetch matrix @@ -281,14 +282,16 @@ slu_options.Fact = DOFACT; slu_options.SymmetricMode = NO; sp_preorder(&slu_options, &superLU_Amat, permCs_[iP], etree, &AC); - dropTol = 0.0; panelSize = sp_ienv(1); relax = sp_ienv(2); StatInit(&slu_stat); lwork = 0; - dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// etree,NULL,lwork,permCs_[iP],permRs_[iP], +// &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&slu_stat,&info); + dgstrf(&slu_options, &AC, relax, panelSize, etree,NULL,lwork,permCs_[iP],permRs_[iP], - &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&slu_stat,&info); + &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&Glu,&slu_stat,&info); Destroy_CompCol_Permuted(&AC); Destroy_CompCol_Matrix(&superLU_Amat); delete [] etree; @@ -351,14 +354,16 @@ slu_options.Fact = DOFACT; slu_options.SymmetricMode = NO; sp_preorder(&slu_options, &superLU_Amat, permCs_[iP], etree, &AC); - dropTol = 0.0; panelSize = sp_ienv(1); relax = sp_ienv(2); StatInit(&slu_stat); lwork = 0; - dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// dgstrf(&slu_options, &AC, dropTol, relax, panelSize, +// etree,NULL,lwork,permRs_[iP],permCs_[iP],&(superLU_Lmats[iP]), +// &(superLU_Umats[iP]),&slu_stat,&info); + dgstrf(&slu_options, &AC, relax, panelSize, etree,NULL,lwork,permRs_[iP],permCs_[iP],&(superLU_Lmats[iP]), - &(superLU_Umats[iP]),&slu_stat,&info); + &(superLU_Umats[iP]),&Glu,&slu_stat,&info); Destroy_CompCol_Permuted(&AC); Destroy_CompCol_Matrix(&superLU_Amat); delete [] etree; diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_solver_superlu.cxx hypre-2.13.0/src/FEI_mv/femli/mli_solver_superlu.cxx --- hypre-2.11.2/src/FEI_mv/femli/mli_solver_superlu.cxx 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/femli/mli_solver_superlu.cxx 2017-10-20 17:42:22.000000000 +0000 @@ -59,12 +59,13 @@ int nnz, row_num, irow, i, j, rowSize, *cols, *recvCntArray; int *dispArray, itemp, *cntArray, icol, colNum, index; int *etree, permcSpec, lwork, panel_size, relax, info, mypid, nprocs; - double *vals, *csrAA, *gcsrAA, *gcscAA, diagPivotThresh, dropTol; + double *vals, *csrAA, *gcsrAA, *gcscAA, diagPivotThresh; MPI_Comm mpiComm; hypre_ParCSRMatrix *hypreA; SuperMatrix AC; superlu_options_t slu_options; SuperLUStat_t slu_stat; + GlobalLU_t Glu; /* --------------------------------------------------------------- * fetch matrix @@ -232,7 +233,6 @@ slu_options.SymmetricMode = NO; sp_preorder(&slu_options, &superLU_Amat, permC_, etree, &AC); diagPivotThresh = 1.0; - dropTol = 0.0; panel_size = sp_ienv(1); relax = sp_ienv(2); StatInit(&slu_stat); @@ -240,9 +240,12 @@ slu_options.ColPerm = MY_PERMC; slu_options.DiagPivotThresh = diagPivotThresh; - dgstrf(&slu_options, &AC, dropTol, relax, panel_size, +// dgstrf(&slu_options, &AC, dropTol, relax, panel_size, +// etree, NULL, lwork, permC_, permR_, &superLU_Lmat, +// &superLU_Umat, &slu_stat, &info); + dgstrf(&slu_options, &AC, relax, panel_size, etree, NULL, lwork, permC_, permR_, &superLU_Lmat, - &superLU_Umat, &slu_stat, &info); + &superLU_Umat, &Glu, &slu_stat, &info); Destroy_CompCol_Permuted(&AC); Destroy_CompCol_Matrix(&superLU_Amat); delete [] etree; diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_utils.c hypre-2.13.0/src/FEI_mv/femli/mli_utils.c --- hypre-2.11.2/src/FEI_mv/femli/mli_utils.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/femli/mli_utils.c 2017-10-20 17:42:22.000000000 +0000 @@ -27,9 +27,7 @@ #include "mli_utils.h" #include "HYPRE_IJ_mv.h" #include "../fei-hypre/HYPRE_parcsr_fgmres.h" -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif +#include "_hypre_lapack.h" /*-------------------------------------------------------------------------- * external function @@ -1146,12 +1144,6 @@ int MLI_Utils_SVD(double *uArray, double *sArray, double *vtArray, double *workArray, int m, int n, int workLen) { - /* prototype */ -#if 0 - void hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, integer *, - integer *, doublereal *, int *, doublereal *, doublereal *, integer *, - doublereal *, integer *, doublereal *, integer *, integer *); -#endif #ifndef MIN #define MIN(a,b) ((a)<(b)?(a):(b)) #endif @@ -1161,16 +1153,12 @@ int info; info = -1; #else - extern int hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, int *, - int *, double *, int *, double *, double *, int *, - double *, int *, double *, int *, int *); - char jobu = 'O'; /* overwrite input with U */ char jobvt = 'S'; /* return rows of V in vtArray */ int dim = MIN(m,n); int info; - hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(&jobu, &jobvt, &m, &n, uArray, + hypre_dgesvd(&jobu, &jobvt, &m, &n, uArray, &m, sArray, (double *) NULL, &m, vtArray, &dim, workArray, &workLen, &info); #endif @@ -1189,17 +1177,13 @@ #ifdef HYPRE_USING_ESSL info = -1; #else - char jobu = 'O'; /* overwrite input with U */ char jobvt = 'N'; double *sArray = (double *) malloc(n*sizeof(double)); int workLen = 5*n; double *workArray = (double *) malloc(workLen*sizeof(double)); - extern int hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, int *, - int *, double *, int *, double *, double *, int *, - double *, int *, double *, int *, int *); - hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(&jobu, &jobvt, &n, &n, uArray, + hypre_dgesvd(&jobu, &jobvt, &n, &n, uArray, &n, sArray, NULL, &n, NULL, &n, workArray, &workLen, &info); free(workArray); diff -Nru hypre-2.11.2/src/FEI_mv/Makefile hypre-2.13.0/src/FEI_mv/Makefile --- hypre-2.11.2/src/FEI_mv/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ # $Revision$ #EHEADER********************************************************************** - include ../config/Makefile.config all: diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/CMakeLists.txt hypre-2.13.0/src/FEI_mv/SuperLU/CMakeLists.txt --- hypre-2.11.2/src/FEI_mv/SuperLU/CMakeLists.txt 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ - -set(HYPRE_superlu_HEADERS - SRC/Cnames.h - SRC/colamd.h - SRC/old_colamd.h - SRC/slu_cdefs.h - SRC/slu_Cnames.h - SRC/slu_dcomplex.h - SRC/slu_ddefs.h - SRC/slu_scomplex.h - SRC/slu_sdefs.h - SRC/slu_util.h - SRC/slu_zdefs.h - SRC/supermatrix.h -) - -set(HYPRE_superlu_SRCS - SRC/colamd.c - SRC/dcolumn_bmod.c - SRC/dcolumn_dfs.c - SRC/dcopy_to_ucol.c - SRC/dgscon.c - SRC/dgsequ.c - SRC/dgsrfs.c - SRC/dgssv.c - SRC/dgssvx.c - SRC/dgstrf.c - SRC/dgstrs.c - SRC/dlacon.c - SRC/dlangs.c - SRC/dlaqgs.c - SRC/dmemory.c - SRC/dpanel_bmod.c - SRC/dpanel_dfs.c - SRC/dpivotgrowth.c - SRC/dpivotL.c - SRC/dpruneL.c - SRC/dreadhb.c - SRC/dsnode_bmod.c - SRC/dsnode_dfs.c - SRC/dsp_blas2.c - SRC/dsp_blas3.c - SRC/dutil.c - SRC/get_perm_c.c - SRC/heap_relax_snode.c - SRC/memory.c - SRC/mmd.c - SRC/relax_snode.c - SRC/sp_coletree.c - SRC/sp_ienv.c - SRC/sp_preorder.c - SRC/superlu_timer.c - SRC/slu_util.c -) - -add_definitions(-DNO_TIMER) -add_library(HYPRE_superlu OBJECT ${HYPRE_superlu_SRCS}) - -install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include) diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/Makefile hypre-2.13.0/src/FEI_mv/SuperLU/Makefile --- hypre-2.11.2/src/FEI_mv/SuperLU/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,73 +0,0 @@ -############################################################################ -# -# Program: SuperLU -# -# Module: Makefile -# -# Purpose: Top-level Makefile -# -# Creation date: October 2, 1995 -# -# Modified: February 4, 1997 Version 1.0 -# November 15, 1997 Version 1.1 -# September 1, 1999 Version 2.0 -# October 15, 2003 Version 3.0 -# -############################################################################ - -############################ -# for hypre -############################ -all: - ( cd SRC; $(MAKE) all) - -install: - ( cd SRC; $(MAKE) install) - -clean: - ( cd SRC; $(MAKE) clean) - -distclean: clean - -############################ - -#include make.inc - -#all: install lib testing - -#lib: superlulib tmglib - -#clean: cleanlib cleantesting - -#install: -# ( cd INSTALL; $(MAKE) ) -# ( cd INSTALL; cp lsame.c ../SRC/; \ -# cp dlamch.c ../SRC/; cp slamch.c ../SRC/ ) - -blaslib: - ( cd CBLAS; $(MAKE) ) - -superlulib: - ( cd SRC; $(MAKE) ) - -tmglib: - ( cd TESTING/MATGEN; $(MAKE) ) - -matlabmex: - ( cd MATLAB; $(MAKE) ) - -testing: - ( cd TESTING ; $(MAKE) ) - -cleanlib: - ( cd SRC; $(MAKE) clean ) - ( cd TESTING/MATGEN; $(MAKE) clean ) - ( cd CBLAS; $(MAKE) clean ) - -cleantesting: - ( cd INSTALL; $(MAKE) clean ) - ( cd TESTING; $(MAKE) clean ) - ( cd MATLAB; $(MAKE) clean ) - ( cd EXAMPLE; $(MAKE) clean ) - ( cd FORTRAN; $(MAKE) clean ) - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/Makefile.superlu hypre-2.13.0/src/FEI_mv/SuperLU/Makefile.superlu --- hypre-2.11.2/src/FEI_mv/SuperLU/Makefile.superlu 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/Makefile.superlu 1970-01-01 00:00:00.000000000 +0000 @@ -1,56 +0,0 @@ -############################################################################ -# -# Program: SuperLU -# -# Module: Makefile -# -# Purpose: Top-level Makefile -# -# Creation date: October 2, 1995 -# -# Modified: February 4, 1997 Version 1.0 -# November 15, 1997 Version 1.1 -# September 1, 1999 Version 2.0 -# October 15, 2003 Version 3.0 -# -############################################################################ - -include make.inc - -all: install lib testing - -lib: superlulib tmglib - -clean: cleanlib cleantesting - -install: - ( cd INSTALL; $(MAKE) ) -# ( cd INSTALL; cp lsame.c ../SRC/; \ -# cp dlamch.c ../SRC/; cp slamch.c ../SRC/ ) - -blaslib: - ( cd CBLAS; $(MAKE) ) - -superlulib: - ( cd SRC; $(MAKE) ) - -tmglib: - ( cd TESTING/MATGEN; $(MAKE) ) - -matlabmex: - ( cd MATLAB; $(MAKE) ) - -testing: - ( cd TESTING ; $(MAKE) ) - -cleanlib: - ( cd SRC; $(MAKE) clean ) - ( cd TESTING/MATGEN; $(MAKE) clean ) - ( cd CBLAS; $(MAKE) clean ) - -cleantesting: - ( cd INSTALL; $(MAKE) clean ) - ( cd TESTING; $(MAKE) clean ) - ( cd MATLAB; $(MAKE) clean ) - ( cd EXAMPLE; $(MAKE) clean ) - ( cd FORTRAN; $(MAKE) clean ) diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/make.inc hypre-2.13.0/src/FEI_mv/SuperLU/make.inc --- hypre-2.11.2/src/FEI_mv/SuperLU/make.inc 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/make.inc 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ -############################################################################ -# -# Program: SuperLU -# -# Module: make.inc -# -# Purpose: Top-level Definitions -# -# Creation date: October 2, 1995 -# -# Modified: February 4, 1997 Version 1.0 -# November 15, 1997 Version 1.1 -# September 1, 1999 Version 2.0 -# -############################################################################ -# -# -# The name of the libraries to be created/linked to -# -TMGLIB = libtmglib.a -SUPERLULIB = libsuperlu_3.0.a -BLASLIB = ../libblas.a - -# -# The archiver and the flag(s) to use when building archive (library) -# If your system has no ranlib, set RANLIB = echo. -# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib - -CC = cc -CFLAGS = -xO3 -xcg92 -FORTRAN = f77 -FFLAGS = -O -LOADER = cc -LOADOPTS = -xO3 - -# -# C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase) -# -CDEFS = -DAdd_ -# -# The directory in which Matlab is installed -# -MATLAB = /usr/sww/pkg/matlab diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/README hypre-2.13.0/src/FEI_mv/SuperLU/README --- hypre-2.11.2/src/FEI_mv/SuperLU/README 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ - SuperLU (Version 3.0) - ===================== - -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -(1) Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. -(2) Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. -(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of -Energy nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -SuperLU contains a set of subroutines to solve a sparse linear system -A*X=B. It uses Gaussian elimination with partial pivoting (GEPP). -The columns of A may be preordered before factorization; the -preordering for sparsity is completely separate from the factorization. - -SuperLU is implemented in ANSI C, and must be compiled with standard -ANSI C compilers. It provides functionality for both real and complex -matrices, in both single and double precision. The file names for the -single-precision real version start with letter "s" (such as sgstrf.c); -the file names for the double-precision real version start with letter "d" -(such as dgstrf.c); the file names for the single-precision complex -version start with letter "c" (such as cgstrf.c); the file names -for the double-precision complex version start with letter "z" -(such as zgstrf.c). - - -SuperLU contains the following directory structure: - - SuperLU/README instructions on installation - SuperLU/CBLAS/ needed BLAS routines in C, not necessarily fast - SuperLU/EXAMPLE/ example programs - SuperLU/FORTRAN/ Fortran interface - SuperLU/INSTALL/ test machine dependent parameters; the Users' Guide. - SuperLU/MAKE_INC/ sample machine-specific make.inc files - SuperLU/MATLAB/ Matlab mex-file interface - SuperLU/SRC/ C source code, to be compiled into the superlu.a library - SuperLU/TESTING/ driver routines to test correctness - SuperLU/Makefile top level Makefile that does installation and testing - SuperLU/make.inc compiler, compile flags, library definitions and C - preprocessor definitions, included in all Makefiles. - (You may need to edit it to be suitable for your system - before compiling the whole package.) - - -Before installing the package, please examine the three things dependent -on your system setup: - -1. Edit the make.inc include file. - This make include file is referenced inside each of the Makefiles - in the various subdirectories. As a result, there is no need to - edit the Makefiles in the subdirectories. All information that is - machine specific has been defined in this include file. - - Example machine-specific make.inc include files are provided - in the MAKE_INC/ directory for several systems, such as - IBM RS/6000, DEC Alpha, SunOS 4.x, SunOS 5.x (Solaris), HP-PA and - SGI Iris 4.x. When you have selected the machine to which you wish - to install SuperLU, copy the appropriate sample include file (if one - is present) into make.inc. For example, if you wish to run - SuperLU on an IBM RS/6000, you can do - - cp MAKE_INC/make.rs6k make.inc - - For the systems other than listed above, slight modifications to the - make.inc file will need to be made. - -2. The BLAS library. - If there is BLAS library available on your machine, you may define - the following in the file SuperLU/make.inc: - BLASDEF = -DUSE_VENDOR_BLAS - BLASLIB = - - The CBLAS/ subdirectory contains the part of the C BLAS needed by - SuperLU package. However, these codes are intended for use only if there - is no faster implementation of the BLAS already available on your machine. - In this case, you should do the following: - - 1) In SuperLU/make.inc, undefine (comment out) BLASDEF, and define: - BLASLIB = ../blas$(PLAT).a - - 2) Go to the SuperLU/ directory, type: - make blaslib - to make the BLAS library from the routines in the CBLAS/ subdirectory. - -3. C preprocessor definition CDEFS. - In the header file SRC/slu_Cnames.h, we use macros to determine how - C routines should be named so that they are callable by Fortran. - (Some vendor-supplied BLAS libraries do not have C interface. So the - re-naming is needed in order for the SuperLU BLAS calls (in C) to - interface with the Fortran-style BLAS.) - The possible options for CDEFS are: - - o -DAdd_: Fortran expects a C routine to have an underscore - postfixed to the name; - o -DNoChange: Fortran expects a C routine name to be identical to - that compiled by C; - o -DUpCase: Fortran expects a C routine name to be all uppercase. - -4. The Matlab MEX-file interface. - The MATLAB/ subdirectory includes Matlab C MEX-files, so that - our factor and solve routines can be called as alternatives to those - built into Matlab. In the file SuperLU/make.inc, define MATLAB to be the - directory in which Matlab is installed on your system, for example: - - MATLAB = /usr/local/matlab - - At the SuperLU/ directory, type "make matlabmex" to build the MEX-file - interface. After you have built the interface, you may go to the MATLAB/ - directory to test the correctness by typing (in Matlab): - trysuperlu - trylusolve - -A Makefile is provided in each subdirectory. The installation can be done -completely automatically by simply typing "make" at the top level. -The test results are in the files below: - INSTALL/install.out - TESTING/stest.out - TESTING/dtest.out - TESTING/ctest.out - TESTING/ztest.out - - - ------------------ -| RELEASE NOTES | ------------------ -* Version 3.0, 10-15-03 - - add "options" and "stat" argument for the driver routines - DGSSV/DGSSVX. This interface is more user-friendly and flexible. - - add more examples in EXAMPLE/ - - add a "symmetric mode" with better performance when the matrix is - symmetric, or diagonal dominant, or positive definite, or nearly so. - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,361 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_cdefs.h" - -/* - * Function prototypes - */ -void cusolve(int, int, complex*, complex*); -void clsolve(int, int, complex*, complex*); -void cmatvec(int, int, int, complex*, complex*, complex*); - - - -/* Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -ccolumn_bmod ( - const int jcol, /* in */ - const int nseg, /* in */ - complex *dense, /* in */ - complex *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in */ - int fpanelc, /* in -- first column in the current panel */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose: - * ======== - * Performs numeric block updates (sup-col) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - complex alpha, beta; - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in supernode - * nsupr = no of rows in supernode (used as leading dimension) - * luptr = location of supernodal LU-block in storage - * kfnz = first nonz in the k-th supernodal segment - * no_zeros = no of leading zeros in a supernodal U-segment - */ - complex ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int fsupc, nsupc, nsupr, segsze; - int nrow; /* No of rows in the matrix of matrix-vector */ - int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; - register int lptr, kfnz, isub, irow, i; - register int no_zeros, new_next; - int ufirst, nextlu; - int fst_col; /* First column within small LU update */ - int d_fsupc; /* Distance between the first column of the current - panel and the first column of the current snode. */ - int *xsup, *supno; - int *lsub, *xlsub; - complex *lusup; - int *xlusup; - int nzlumax; - complex *tempv1; - complex zero = {0.0, 0.0}; - complex one = {1.0, 0.0}; - complex none = {-1.0, 0.0}; - complex comp_temp, comp_temp1; - int mem_error; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - nzlumax = Glu->nzlumax; - jcolp1 = jcol + 1; - jsupno = supno[jcol]; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - - krep = segrep[k]; - k--; - ksupno = supno[krep]; - if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ - - fsupc = xsup[ksupno]; - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - /* Distance from the current supernode to the current panel; - d_fsupc=0 if fsupc > fpanelc. */ - d_fsupc = fst_col - fsupc; - - luptr = xlusup[fst_col] + d_fsupc; - lptr = xlsub[fsupc] + d_fsupc; - - kfnz = repfnz[krep]; - kfnz = SUPERLU_MAX ( kfnz, fpanelc ); - - segsze = krep - kfnz + 1; - nsupc = krep - fst_col + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nrow = nsupr - d_fsupc - nsupc; - krep_ind = lptr + nsupc - 1; - - - - - /* - * Case 1: Update U-segment of size 1 -- col-col update - */ - if ( segsze == 1 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - c_sub(&dense[irow], &dense[irow], &comp_temp); - luptr++; - } - - } else if ( segsze <= 3 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { /* Case 2: 2cols-col update */ - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - c_sub(&ukj, &ukj, &comp_temp); - dense[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense[irow], &dense[irow], &comp_temp); - } - } else { /* Case 3: 3cols-col update */ - ukj2 = dense[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - c_sub(&ukj1, &ukj1, &comp_temp); - - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&ukj, &ukj, &comp_temp); - - dense[lsub[krep_ind]] = ukj; - dense[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - luptr2++; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense[irow], &dense[irow], &comp_temp); - } - } - - - } else { - /* - * Case: sup-col update - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense - */ - - no_zeros = kfnz - fst_col; - - /* Copy U[*,j] segment from dense[*] to tempv[*] */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - tempv[i] = dense[irow]; - ++isub; - } - - /* Dense triangular solve -- start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - clsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); -#endif - - - /* Scatter tempv[] into SPA dense[] as a temporary storage */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense[irow] = tempv[i]; - tempv[i] = zero; - ++isub; - } - - /* Scatter tempv1[] into SPA dense[] */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - c_sub(&dense[irow], &dense[irow], &tempv1[i]); - tempv1[i] = zero; - ++isub; - } - } - - } /* if jsupno ... */ - - } /* for each segment... */ - - /* - * Process the supernodal portion of L\U[*,j] - */ - nextlu = xlusup[jcol]; - fsupc = xsup[jsupno]; - - /* Copy the SPA dense into L\U[*,j] */ - new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; - while ( new_next > nzlumax ) { - if (mem_error = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) - return (mem_error); - lusup = Glu->lusup; - lsub = Glu->lsub; - } - - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = zero; - ++nextlu; - } - - xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ - - /* For more updates within the panel (also within the current supernode), - * should start from the first column of the panel, or the first column - * of the supernode, whichever is bigger. There are 2 cases: - * 1) fsupc < fpanelc, then fst_col := fpanelc - * 2) fsupc >= fpanelc, then fst_col := fsupc - */ - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - if ( fst_col < jcol ) { - - /* Distance between the current supernode and the current panel. - d_fsupc=0 if fsupc >= fpanelc. */ - d_fsupc = fst_col - fsupc; - - lptr = xlsub[fsupc] + d_fsupc; - luptr = xlusup[fst_col] + d_fsupc; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nsupc = jcol - fst_col; /* Excluding jcol */ - nrow = nsupr - d_fsupc - nsupc; - - /* Points to the beginning of jcol in snode L\U(jsupno) */ - ufirst = xlusup[jcol] + d_fsupc; - - ops[TRSV] += 4 * nsupc * (nsupc - 1); - ops[GEMV] += 8 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#else - ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#endif - - alpha = none; beta = one; /* y := beta*y + alpha*A*x */ - -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - - cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], tempv ); - - /* Copy updates from tempv[*] into lusup[*] */ - isub = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - c_sub(&lusup[isub], &lusup[isub], &tempv[i]); - tempv[i] = zero; - ++isub; - } - -#endif - - - } /* if fst_col < jcol ... */ - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,269 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -/* What type of supernodes we want */ -#define T2_SUPER - -int -ccolumn_dfs( - const int m, /* in - number of rows in the matrix */ - const int jcol, /* in */ - int *perm_r, /* in */ - int *nseg, /* modified - with new segments appended */ - int *lsub_col, /* in - defines the RHS vector to start the dfs */ - int *segrep, /* modified - with new segments appended */ - int *repfnz, /* modified */ - int *xprune, /* modified */ - int *marker, /* modified */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * "column_dfs" performs a symbolic factorization on column jcol, and - * decide the supernode boundary. - * - * This routine does not use numeric values, but only use the RHS - * row indices to start the dfs. - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. The routine returns a list of such supernodal - * representatives in topological order of the dfs that generates them. - * The location of the first nonzero in each such supernodal segment - * (supernodal entry location) is also returned. - * - * Local parameters - * ================ - * nseg: no of segments in current U[*,j] - * jsuper: jsuper=EMPTY if column j does not belong to the same - * supernode as j-1. Otherwise, jsuper=nsuper. - * - * marker2: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - * Return value - * ============ - * 0 success; - * > 0 number of bytes allocated when run out of space. - * - */ - int jcolp1, jcolm1, jsuper, nsuper, nextl; - int k, krep, krow, kmark, kperm; - int *marker2; /* Used for small panel LU */ - int fsupc; /* First column of a snode */ - int myfnz; /* First nonz column of a U-segment */ - int chperm, chmark, chrep, kchild; - int xdfs, maxdfs, kpar, oldrep; - int jptr, jm1ptr; - int ito, ifrom, istop; /* Used to compress row subscripts */ - int mem_error; - int *xsup, *supno, *lsub, *xlsub; - int nzlmax; - static int first = 1, maxsuper; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - if ( first ) { - maxsuper = sp_ienv(3); - first = 0; - } - jcolp1 = jcol + 1; - jcolm1 = jcol - 1; - nsuper = supno[jcol]; - jsuper = nsuper; - nextl = xlsub[jcol]; - marker2 = &marker[2*m]; - - - /* For each nonzero in A[*,jcol] do dfs */ - for (k = 0; lsub_col[k] != EMPTY; k++) { - - krow = lsub_col[k]; - lsub_col[k] = EMPTY; - kmark = marker2[krow]; - - /* krow was visited before, go to the next nonz */ - if ( kmark == jcol ) continue; - - /* For each unmarked nbr krow of jcol - * krow is in L: place it in structure of L[*,jcol] - */ - marker2[krow] = jcol; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - lsub[nextl++] = krow; /* krow is indexed into A */ - if ( nextl >= nzlmax ) { - if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ - } else { - /* krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz[krep]; - - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > kperm ) repfnz[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker2[kchild]; - - if ( chmark != jcol ) { /* Not reached yet */ - marker2[kchild] = jcol; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,k] */ - if ( chperm == EMPTY ) { - lsub[nextl++] = kchild; - if ( nextl >= nzlmax ) { - if ( mem_error = - cLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( chmark != jcolm1 ) jsuper = EMPTY; - } else { - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz[chrep]; - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz[chrep] = chperm; - } else { - /* Continue dfs at super-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L^t) */ - parent[krep] = oldrep; - repfnz[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - } /* else */ - - } /* else */ - - } /* if */ - - } /* while */ - - /* krow has no more unexplored nbrs; - * place supernode-rep krep in postorder DFS. - * backtrack dfs to its parent - */ - segrep[*nseg] = krep; - ++(*nseg); - kpar = parent[krep]; /* Pop from stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - - } while ( kpar != EMPTY ); /* Until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonzero ... */ - - /* Check to see if j belongs in the same supernode as j-1 */ - if ( jcol == 0 ) { /* Do nothing for column 0 */ - nsuper = supno[0] = 0; - } else { - fsupc = xsup[nsuper]; - jptr = xlsub[jcol]; /* Not compressed yet */ - jm1ptr = xlsub[jcolm1]; - -#ifdef T2_SUPER - if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; -#endif - /* Make sure the number of columns in a supernode doesn't - exceed threshold. */ - if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; - - /* If jcol starts a new supernode, reclaim storage space in - * lsub from the previous supernode. Note we only store - * the subscript set of the first and last columns of - * a supernode. (first for num values, last for pruning) - */ - if ( jsuper == EMPTY ) { /* starts a new supernode */ - if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ -#ifdef CHK_COMPRESS - printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); -#endif - ito = xlsub[fsupc+1]; - xlsub[jcolm1] = ito; - istop = ito + jptr - jm1ptr; - xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ - xlsub[jcol] = istop; - for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) - lsub[ito] = lsub[ifrom]; - nextl = ito; /* = istop + length(jcol) */ - } - nsuper++; - supno[jcol] = nsuper; - } /* if a new supernode */ - - } /* else: jcol > 0 */ - - /* Tidy up the pointers before exit */ - xsup[nsuper+1] = jcolp1; - supno[jcolp1] = nsuper; - xprune[jcol] = nextl; /* Initialize upper bound for pruning */ - xlsub[jcolp1] = nextl; - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,103 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -int -ccopy_to_ucol( - int jcol, /* in */ - int nseg, /* in */ - int *segrep, /* in */ - int *repfnz, /* in */ - int *perm_r, /* in */ - complex *dense, /* modified - reset to zero on return */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Gather from SPA dense[*] to global ucol[*]. - */ - int ksub, krep, ksupno; - int i, k, kfnz, segsze; - int fsupc, isub, irow; - int jsupno, nextu; - int new_next, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - complex *ucol; - int *usub, *xusub; - int nzumax; - - complex zero = {0.0, 0.0}; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - nzumax = Glu->nzumax; - - jsupno = supno[jcol]; - nextu = xusub[jcol]; - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - krep = segrep[k--]; - ksupno = supno[krep]; - - if ( ksupno != jsupno ) { /* Should go into ucol[] */ - kfnz = repfnz[krep]; - if ( kfnz != EMPTY ) { /* Nonzero U-segment */ - - fsupc = xsup[ksupno]; - isub = xlsub[fsupc] + kfnz - fsupc; - segsze = krep - kfnz + 1; - - new_next = nextu + segsze; - while ( new_next > nzumax ) { - if (mem_error = cLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) - return (mem_error); - ucol = Glu->ucol; - if (mem_error = cLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) - return (mem_error); - usub = Glu->usub; - lsub = Glu->lsub; - } - - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - usub[nextu] = perm_r[irow]; - ucol[nextu] = dense[irow]; - dense[irow] = zero; - nextu++; - isub++; - } - - } - - } - - } /* for each segment... */ - - xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgscon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgscon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgscon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,143 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: cgscon.c - * History: Modified from lapack routines CGECON. - */ -#include -#include "slu_cdefs.h" - -void -cgscon(char *norm, SuperMatrix *L, SuperMatrix *U, - float anorm, float *rcond, SuperLUStat_t *stat, int *info) -{ -/* - Purpose - ======= - - CGSCON estimates the reciprocal of the condition number of a general - real matrix A, in either the 1-norm or the infinity-norm, using - the LU factorization computed by CGETRF. - - An estimate is obtained for norm(inv(A)), and the reciprocal of the - condition number is computed as - RCOND = 1 / ( norm(A) * norm(inv(A)) ). - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - NORM (input) char* - Specifies whether the 1-norm condition number or the - infinity-norm condition number is required: - = '1' or 'O': 1-norm; - = 'I': Infinity-norm. - - L (input) SuperMatrix* - The factor L from the factorization Pr*A*Pc=L*U as computed by - cgstrf(). Use compressed row subscripts storage for supernodes, - i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - - U (input) SuperMatrix* - The factor U from the factorization Pr*A*Pc=L*U as computed by - cgstrf(). Use column-wise storage scheme, i.e., U has types: - Stype = SLU_NC, Dtype = SLU_C, Mtype = TRU. - - ANORM (input) float - If NORM = '1' or 'O', the 1-norm of the original matrix A. - If NORM = 'I', the infinity-norm of the original matrix A. - - RCOND (output) float* - The reciprocal of the condition number of the matrix A, - computed as RCOND = 1/(norm(A) * norm(inv(A))). - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - - ===================================================================== -*/ - - /* Local variables */ - int kase, kase1, onenrm, i; - float ainvnm; - complex *work; - extern int crscl_(int *, complex *, complex *, int *); - - extern int clacon_(int *, complex *, complex *, float *, int *); - - - /* Test the input parameters. */ - *info = 0; - onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); - if (! onenrm && ! lsame_(norm, "I")) *info = -1; - else if (L->nrow < 0 || L->nrow != L->ncol || - L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU) - *info = -2; - else if (U->nrow < 0 || U->nrow != U->ncol || - U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU) - *info = -3; - if (*info != 0) { - i = -(*info); - xerbla_("cgscon", &i); - return; - } - - /* Quick return if possible */ - *rcond = 0.; - if ( L->nrow == 0 || U->nrow == 0) { - *rcond = 1.; - return; - } - - work = complexCalloc( 3*L->nrow ); - - - if ( !work ) - ABORT("Malloc fails for work arrays in cgscon."); - - /* Estimate the norm of inv(A). */ - ainvnm = 0.; - if ( onenrm ) kase1 = 1; - else kase1 = 2; - kase = 0; - - do { - clacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase); - - if (kase == 0) break; - - if (kase == kase1) { - /* Multiply by inv(L). */ - sp_ctrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); - - /* Multiply by inv(U). */ - sp_ctrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); - - } else { - - /* Multiply by inv(U'). */ - sp_ctrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); - - /* Multiply by inv(L'). */ - sp_ctrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); - - } - - } while ( kase != 0 ); - - /* Compute the estimate of the reciprocal condition number. */ - if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; - - SUPERLU_FREE (work); - return; - -} /* cgscon */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsequ.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,184 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: cgsequ.c - * History: Modified from LAPACK routine CGEEQU - */ -#include -#include "slu_cdefs.h" - -void -cgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd, - float *colcnd, float *amax, int *info) -{ -/* - Purpose - ======= - - CGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_C; Mtype = SLU_GE. - - R (output) float*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) float*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) float* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) float* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) float* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= A->nrow: the i-th row of A is exactly zero - > A->ncol: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - complex *Aval; - int i, j, irow; - float rcmin, rcmax; - float bignum, smlnum; - extern double slamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("cgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Get machine constants. */ - smlnum = slamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], c_abs1(&Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], c_abs1(&Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* cgsequ */ - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsrfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,447 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: cgsrfs.c - * History: Modified from lapack routine CGERFS - */ -#include -#include "slu_cdefs.h" - -void -cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, char *equed, float *R, float *C, - SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * CGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates for - * the solution. - * - * If equilibration was performed, the system becomes: - * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * A (input) SuperMatrix* - * The original matrix A in the system, or the scaled A if - * equilibration was done. The type of A can be: - * Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_GE. - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * cgstrf(). Use column-wise storage scheme, - * i.e., U has types: Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (A->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * equed (input) Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by - * diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * - * R (input) float*, dimension (A->nrow) - * The row scale factors for A. - * If equed = 'R' or 'B', A is premultiplied by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * - * C (input) float*, dimension (A->ncol) - * The column scale factors for A. - * If equed = 'C' or 'B', A is postmultiplied by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * - * B (input) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * The right hand side matrix B. - * if equed = 'R' or 'B', B is premultiplied by diag(R). - * - * X (input/output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * On entry, the solution matrix X, as computed by cgstrs(). - * On exit, the improved solution matrix X. - * if *equed = 'C' or 'B', X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * FERR (output) float*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * - * BERR (output) float*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if INFO = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 5 - - /* Table of constant values */ - int ione = 1; - complex ndone = {-1., 0.}; - complex done = {1., 0.}; - - /* Local variables */ - NCformat *Astore; - complex *Aval; - SuperMatrix Bjcol; - DNformat *Bstore, *Xstore, *Bjcol_store; - complex *Bmat, *Xmat, *Bptr, *Xptr; - int kase; - float safe1, safe2; - int i, j, k, irow, nz, count, notran, rowequ, colequ; - int ldb, ldx, nrhs; - float s, xk, lstres, eps, safmin; - char transc[1]; - trans_t transt; - complex *work; - float *rwork; - int *iwork; - extern double slamch_(char *); - extern int clacon_(int *, complex *, complex *, float *, int *); -#ifdef _CRAY - extern int CCOPY(int *, complex *, int *, complex *, int *); - extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *); -#else - extern int ccopy_(int *, complex *, int *, complex *, int *); - extern int caxpy_(int *, complex *, complex *, int *, complex *, int *); -#endif - - Astore = A->Store; - Aval = Astore->nzval; - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - /* Test the input parameters */ - *info = 0; - notran = (trans == NOTRANS); - if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE ) - *info = -2; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) - *info = -3; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) - *info = -4; - else if ( ldb < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) - *info = -10; - else if ( ldx < SUPERLU_MAX(0, A->nrow) || - X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE ) - *info = -11; - if (*info != 0) { - i = -(*info); - xerbla_("cgsrfs", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || nrhs == 0) { - for (j = 0; j < nrhs; ++j) { - ferr[j] = 0.; - berr[j] = 0.; - } - return; - } - - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - - /* Allocate working space */ - work = complexMalloc(2*A->nrow); - rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) ); - iwork = intMalloc(A->nrow); - if ( !work || !rwork || !iwork ) - ABORT("Malloc fails for work/rwork/iwork."); - - if ( notran ) { - *(unsigned char *)transc = 'N'; - transt = TRANS; - } else { - *(unsigned char *)transc = 'T'; - transt = NOTRANS; - } - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = slamch_("Epsilon"); - safmin = slamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - - /* Compute the number of nonzeros in each row (or column) of A */ - for (i = 0; i < A->nrow; ++i) iwork[i] = 0; - if ( notran ) { - for (k = 0; k < A->ncol; ++k) - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - ++iwork[Astore->rowind[i]]; - } else { - for (k = 0; k < A->ncol; ++k) - iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; - } - - /* Copy one column of RHS B into Bjcol. */ - Bjcol.Stype = B->Stype; - Bjcol.Dtype = B->Dtype; - Bjcol.Mtype = B->Mtype; - Bjcol.nrow = B->nrow; - Bjcol.ncol = 1; - Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); - Bjcol_store = Bjcol.Store; - Bjcol_store->lda = ldb; - Bjcol_store->nzval = work; /* address aliasing */ - - /* Do for each right hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - Bptr = &Bmat[j*ldb]; - Xptr = &Xmat[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - -#ifdef _CRAY - CCOPY(&A->nrow, Bptr, &ione, work, &ione); -#else - ccopy_(&A->nrow, Bptr, &ione, work, &ione); -#endif - sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione); - - /* Compute componentwise relative backward error from formula - max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) - where abs(Z) is the componentwise absolute value of the matrix - or vector Z. If the i-th component of the denominator is less - than SAFE2, then SAFE1 is added to the i-th component of the - numerator and denominator before dividing. */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if (notran) { - for (k = 0; k < A->ncol; ++k) { - xk = c_abs1( &Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - s += c_abs1(&Aval[i]) * c_abs1(&Xptr[irow]); - } - rwork[k] += s; - } - } - s = 0.; - for (i = 0; i < A->nrow; ++i) { - if (rwork[i] > safe2) - s = SUPERLU_MAX( s, c_abs1(&work[i]) / rwork[i] ); - else - s = SUPERLU_MAX( s, (c_abs1(&work[i]) + safe1) / - (rwork[i] + safe1) ); - } - berr[j] = s; - - /* Test stopping criterion. Continue iterating if - 1) The residual BERR(J) is larger than machine epsilon, and - 2) BERR(J) decreased by at least a factor of 2 during the - last iteration, and - 3) At most ITMAX iterations tried. */ - - if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { - /* Update solution and try again. */ - cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - -#ifdef _CRAY - CAXPY(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#else - caxpy_(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#endif - lstres = berr[j]; - ++count; - } else { - break; - } - - } /* end while */ - - stat->RefineSteps = count; - - /* Bound error from formula: - norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* - ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) - where - norm(Z) is the magnitude of the largest component of Z - inv(op(A)) is the inverse of op(A) - abs(Z) is the componentwise absolute value of the matrix or - vector Z - NZ is the maximum number of nonzeros in any row of A, plus 1 - EPS is machine epsilon - - The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) - is incremented by SAFE1 if the i-th component of - abs(op(A))*abs(X) + abs(B) is less than SAFE2. - - Use CLACON to estimate the infinity-norm of the matrix - inv(op(A)) * diag(W), - where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if ( notran ) { - for (k = 0; k < A->ncol; ++k) { - xk = c_abs1( &Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - xk = c_abs1( &Xptr[irow] ); - s += c_abs1(&Aval[i]) * xk; - } - rwork[k] += s; - } - } - - for (i = 0; i < A->nrow; ++i) - if (rwork[i] > safe2) - rwork[i] = c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; - else - rwork[i] = c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; - kase = 0; - - do { - clacon_(&A->nrow, &work[A->nrow], work, - &ferr[j], &kase); - if (kase == 0) break; - - if (kase == 1) { - /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) { - cs_mult(&work[i], &work[i], C[i]); - } - else if ( !notran && rowequ ) - for (i = 0; i < A->nrow; ++i) { - cs_mult(&work[i], &work[i], R[i]); - } - - cgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); - - for (i = 0; i < A->nrow; ++i) { - cs_mult(&work[i], &work[i], rwork[i]); - } - } else { - /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ - for (i = 0; i < A->nrow; ++i) { - cs_mult(&work[i], &work[i], rwork[i]); - } - - cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) { - cs_mult(&work[i], &work[i], C[i]); - } - else if ( !notran && rowequ ) - for (i = 0; i < A->ncol; ++i) { - cs_mult(&work[i], &work[i], R[i]); - } - } - - } while ( kase != 0 ); - - /* Normalize error. */ - lstres = 0.; - if ( notran && colequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, C[i] * c_abs1( &Xptr[i]) ); - } else if ( !notran && rowequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, R[i] * c_abs1( &Xptr[i]) ); - } else { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, c_abs1( &Xptr[i]) ); - } - if ( lstres != 0. ) - ferr[j] /= lstres; - - } /* for each RHS j ... */ - - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - SUPERLU_FREE(iwork); - SUPERLU_FREE(Bjcol.Store); - - return; - -} /* cgsrfs */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssv.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,221 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_cdefs.h" - -void -cgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, - SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * CGSSV solves the system of linear equations A*X=B, using the - * LU factorization from CGSTRF. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. Permute the columns of A, forming A*Pc, where Pc - * is a permutation matrix. For more details of this step, - * see sp_preorder.c. - * - * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined - * by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 1.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the - * above algorithm to the transpose of A: - * - * 2.1. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix. - * For more details of this step, see sp_preorder.c. - * - * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr - * determined by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 2.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR; Dtype = SLU_C; Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, column permutation vector of size A->ncol - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * If A->Stype = SLU_NR, column permutation vector of size A->nrow - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or - * options->Fact = SamePattern_SameRowPerm, it is an input argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * Otherwise, it is an output argument. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->RowPerm = MY_PERMR or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument. - * otherwise it is an output argument. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - DNformat *Bstore; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int lwork = 0, *etree, i; - - /* Set default values for some parameters */ - float drop_tol = 0.; - int panel_size; /* panel size */ - int relax; /* no of columns in a relaxed snodes */ - int permc_spec; - trans_t trans = NOTRANS; - double *utime; - double t; /* Temporary time */ - - /* Test the input parameters ... */ - *info = 0; - Bstore = B->Store; - if ( options->Fact != DOFACT ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_C || A->Mtype != SLU_GE ) - *info = -2; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) - *info = -7; - if ( *info != 0 ) { - i = -(*info); - xerbla_("cgssv", &i); - return; - } - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - trans = TRANS; - } else { - if ( A->Stype == SLU_NC ) AA = A; - } - - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t; - - etree = intMalloc(A->ncol); - - t = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t; - - panel_size = sp_ienv(1); - relax = sp_ienv(2); - - /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4));*/ - t = SuperLU_timer_(); - /* Compute the LU factorization of A. */ - cgstrf(options, &AC, drop_tol, relax, panel_size, - etree, NULL, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t; - - t = SuperLU_timer_(); - if ( *info == 0 ) { - /* Solve the system A*X=B, overwriting B with X. */ - cgstrs (trans, L, U, perm_c, perm_r, B, stat, info); - } - utime[SOLVE] = SuperLU_timer_() - t; - - SUPERLU_FREE (etree); - Destroy_CompCol_Permuted(&AC); - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssvx.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,614 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_cdefs.h" - -void -cgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - int *etree, char *equed, float *R, float *C, - SuperMatrix *L, SuperMatrix *U, void *work, int lwork, - SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, - float *rcond, float *ferr, float *berr, - mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * CGSSVX solves the system of linear equations A*X=B or A'*X=B, using - * the LU factorization from cgstrf(). Error bounds on the solution and - * a condition estimate are also provided. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A is - * overwritten by diag(R)*A*diag(C) and B by diag(R)*B - * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans - * = TRANS or CONJ). - * - * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation - * matrix that usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 1.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the matrix A (after equilibration if options->Equil = YES) - * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. - * - * 1.4. Compute the reciprocal pivot growth factor. - * - * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form of - * A is used to estimate the condition number of the matrix A. If - * the reciprocal of the condition number is less than machine - * precision, info = A->ncol+1 is returned as a warning, but the - * routine still goes on to solve for X and computes error bounds - * as described below. - * - * 1.6. The system of equations is solved for X using the factored form - * of A. - * - * 1.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 1.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm - * to the transpose of A: - * - * 2.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A' is - * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B - * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). - * - * 2.2. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix that - * usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 2.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the transpose(A) (after equilibration if - * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the - * permutation Pr determined by partial pivoting. - * - * 2.4. Compute the reciprocal pivot growth factor. - * - * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form - * of transpose(A) is used to estimate the condition number of the - * matrix A. If the reciprocal of the condition number - * is less than machine precision, info = A->nrow+1 is returned as - * a warning, but the routine still goes on to solve for X and - * computes error bounds as described below. - * - * 2.6. The system of equations is solved for X using the factored form - * of transpose(A). - * - * 2.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 2.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input/output) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * On entry, If options->Fact = FACTORED and equed is not 'N', - * then A must have been equilibrated by the scaling factors in - * R and/or C. - * On exit, A is not modified if options->Equil = NO, or if - * options->Equil = YES but equed = 'N' on exit. - * Otherwise, if options->Equil = YES and equed is not 'N', - * A is scaled as follows: - * If A->Stype = SLU_NC: - * equed = 'R': A := diag(R) * A - * equed = 'C': A := A * diag(C) - * equed = 'B': A := diag(R) * A * diag(C). - * If A->Stype = SLU_NR: - * equed = 'R': transpose(A) := diag(R) * transpose(A) - * equed = 'C': transpose(A) := transpose(A) * diag(C) - * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * - * If A->Stype = SLU_NR, column permutation vector of size A->nrow, - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by a - * new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument. - * - * etree (input/output) int*, dimension (A->ncol) - * Elimination tree of Pc'*A'*A*Pc. - * If options->Fact != FACTORED and options->Fact != DOFACT, - * etree is an input argument, otherwise it is an output argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * equed (input/output) char* - * Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * If options->Fact = FACTORED, equed is an input argument, - * otherwise it is an output argument. - * - * R (input/output) float*, dimension (A->nrow) - * The row scale factors for A or transpose(A). - * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * If options->Fact = FACTORED, R is an input argument, - * otherwise, R is output. - * If options->zFact = FACTORED and equed = 'R' or 'B', each element - * of R must be positive. - * - * C (input/output) float*, dimension (A->ncol) - * The column scale factors for A or transpose(A). - * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * If options->Fact = FACTORED, C is an input argument, - * otherwise, C is output. - * If options->Fact = FACTORED and equed = 'C' or 'B', each element - * of C must be positive. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU. - * - * work (workspace/output) void*, size (lwork) (in bytes) - * User supplied workspace, should be large enough - * to hold data structures for factors L and U. - * On exit, if fact is not 'F', L and U point to this array. - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * mem_usage->total_needed; no other side effects. - * - * See argument 'mem_usage' for memory usage statistics. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * If B->ncol = 0, only LU decomposition is performed, the triangular - * solve is skipped. - * On exit, - * if equed = 'N', B is not modified; otherwise - * if A->Stype = SLU_NC: - * if options->Trans = NOTRANS and equed = 'R' or 'B', - * B is overwritten by diag(R)*B; - * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', - * B is overwritten by diag(C)*B; - * if A->Stype = SLU_NR: - * if options->Trans = NOTRANS and equed = 'C' or 'B', - * B is overwritten by diag(C)*B; - * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', - * B is overwritten by diag(R)*B. - * - * X (output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * If info = 0 or info = A->ncol+1, X contains the solution matrix - * to the original system of equations. Note that A and B are modified - * on exit if equed is not 'N', and the solution to the equilibrated - * system is inv(diag(C))*X if options->Trans = NOTRANS and - * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' - * and equed = 'R' or 'B'. - * - * recip_pivot_growth (output) float* - * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). - * The infinity norm is used. If recip_pivot_growth is much less - * than 1, the stability of the LU factorization could be poor. - * - * rcond (output) float* - * The estimate of the reciprocal condition number of the matrix A - * after equilibration (if done). If rcond is less than the machine - * precision (in particular, if rcond = 0), the matrix is singular - * to working precision. This condition is indicated by a return - * code of info > 0. - * - * FERR (output) float*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * If options->IterRefine = NOREFINE, ferr = 1.0. - * - * BERR (output) float*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * If options->IterRefine = NOREFINE, berr = 1.0. - * - * mem_usage (output) mem_usage_t* - * Record the memory usage statistics, consisting of following fields: - * - for_lu (float) - * The amount of space used in bytes for L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * The number of memory expansions during the LU factorization. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly - * singular, so the solution and error bounds - * could not be computed. - * = A->ncol+1: U is nonsingular, but RCOND is less than machine - * precision, meaning that the matrix is singular to - * working precision. Nevertheless, the solution and - * error bounds are computed because there are a number - * of situations where the computed solution can be more - * accurate than the value of RCOND would suggest. - * > A->ncol+1: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - - DNformat *Bstore, *Xstore; - complex *Bmat, *Xmat; - int ldb, ldx, nrhs; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int colequ, equil, nofact, notran, rowequ, permc_spec; - trans_t trant; - char norm[1]; - int i, j, info1; - float amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; - int relax, panel_size; - float diag_pivot_thresh, drop_tol; - double t0; /* temporary time */ - double *utime; - - /* External functions */ - extern float clangs(char *, SuperMatrix *); - extern double slamch_(char *); - - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - *info = 0; - nofact = (options->Fact != FACTORED); - equil = (options->Equil == YES); - notran = (options->Trans == NOTRANS); - if ( nofact ) { - *(unsigned char *)equed = 'N'; - rowequ = FALSE; - colequ = FALSE; - } else { - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - smlnum = slamch_("Safe minimum"); - bignum = 1. / smlnum; - } - -#if 0 -printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", - options->Fact, options->Trans, *equed); -#endif - - /* Test the input parameters */ - if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && - options->Fact != SamePattern_SameRowPerm && - !notran && options->Trans != TRANS && options->Trans != CONJ && - !equil && options->Equil != NO) - *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_C || A->Mtype != SLU_GE ) - *info = -2; - else if (options->Fact == FACTORED && - !(rowequ || colequ || lsame_(equed, "N"))) - *info = -6; - else { - if (rowequ) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, R[j]); - rcmax = SUPERLU_MAX(rcmax, R[j]); - } - if (rcmin <= 0.) *info = -7; - else if ( A->nrow > 0) - rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else rowcnd = 1.; - } - if (colequ && *info == 0) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, C[j]); - rcmax = SUPERLU_MAX(rcmax, C[j]); - } - if (rcmin <= 0.) *info = -8; - else if (A->nrow > 0) - colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else colcnd = 1.; - } - if (*info == 0) { - if ( lwork < -1 ) *info = -12; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_C || - B->Mtype != SLU_GE ) - *info = -13; - else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || - (B->ncol != 0 && B->ncol != X->ncol) || - X->Stype != SLU_DN || - X->Dtype != SLU_C || X->Mtype != SLU_GE ) - *info = -14; - } - } - if (*info != 0) { - i = -(*info); - xerbla_("cgssvx", &i); - return; - } - - /* Initialization for factor parameters */ - panel_size = sp_ienv(1); - relax = sp_ienv(2); - diag_pivot_thresh = options->DiagPivotThresh; - drop_tol = 0.0; - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - if ( notran ) { /* Reverse the transpose argument. */ - trant = TRANS; - notran = 0; - } else { - trant = NOTRANS; - notran = 1; - } - } else { /* A->Stype == SLU_NC */ - trant = options->Trans; - AA = A; - } - - if ( nofact && equil ) { - t0 = SuperLU_timer_(); - /* Compute row and column scalings to equilibrate the matrix A. */ - cgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); - - if ( info1 == 0 ) { - /* Equilibrate matrix A. */ - claqgs(AA, R, C, rowcnd, colcnd, amax, equed); - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - } - utime[EQUIL] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Scale the right hand side if equilibration was performed. */ - if ( notran ) { - if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); - } - } - } else if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); - } - } - } - - if ( nofact ) { - - t0 = SuperLU_timer_(); - /* - * Gnet column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t0; - - t0 = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t0; - -/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4)); - fflush(stdout); */ - - /* Compute the LU factorization of A*Pc. */ - t0 = SuperLU_timer_(); - cgstrf(options, &AC, drop_tol, relax, panel_size, - etree, work, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t0; - - if ( lwork == -1 ) { - mem_usage->total_needed = *info - A->ncol; - return; - } - } - - if ( options->PivotGrowth ) { - if ( *info > 0 ) { - if ( *info <= A->ncol ) { - /* Compute the reciprocal pivot growth factor of the leading - rank-deficient *info columns of A. */ - *recip_pivot_growth = cPivotGrowth(*info, AA, perm_c, L, U); - } - return; - } - - /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ - *recip_pivot_growth = cPivotGrowth(A->ncol, AA, perm_c, L, U); - } - - if ( options->ConditionNumber ) { - /* Estimate the reciprocal of the condition number of A. */ - t0 = SuperLU_timer_(); - if ( notran ) { - *(unsigned char *)norm = '1'; - } else { - *(unsigned char *)norm = 'I'; - } - anorm = clangs(norm, AA); - cgscon(norm, L, U, anorm, rcond, stat, info); - utime[RCOND] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Compute the solution matrix X. */ - for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ - for (i = 0; i < B->nrow; i++) - Xmat[i + j*ldx] = Bmat[i + j*ldb]; - - t0 = SuperLU_timer_(); - cgstrs (trant, L, U, perm_c, perm_r, X, stat, info); - utime[SOLVE] = SuperLU_timer_() - t0; - - /* Use iterative refinement to improve the computed solution and compute - error bounds and backward error estimates for it. */ - t0 = SuperLU_timer_(); - if ( options->IterRefine != NOREFINE ) { - cgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, - X, ferr, berr, stat, info); - } else { - for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; - } - utime[REFINE] = SuperLU_timer_() - t0; - - /* Transform the solution matrix X to a solution of the original system. */ - if ( notran ) { - if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); - } - } - } else if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); - } - } - } /* end if nrhs > 0 */ - - if ( options->ConditionNumber ) { - /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ - if ( *rcond < slamch_("E") ) *info = A->ncol + 1; - } - - if ( nofact ) { - cQuerySpace(L, U, mem_usage); - Destroy_CompCol_Permuted(&AC); - } - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrf.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,433 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -void -cgstrf (superlu_options_t *options, SuperMatrix *A, float drop_tol, - int relax, int panel_size, int *etree, void *work, int lwork, - int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * CGSTRF computes an LU factorization of a general sparse m-by-n - * matrix A using partial pivoting with row interchanges. - * The factorization has the form - * Pr * A = L * U - * where Pr is a row permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper - * triangular (upper trapezoidal if A->nrow < A->ncol). - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_C; Mtype = SLU_GE. - * - * drop_tol (input) float (NOT IMPLEMENTED) - * Drop tolerance parameter. At step j of the Gaussian elimination, - * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. - * 0 <= drop_tol <= 1. The default value of drop_tol is 0. - * - * relax (input) int - * To control degree of relaxing supernodes. If the number - * of nodes (columns) in a subtree of the elimination tree is less - * than relax, this subtree is considered as one supernode, - * regardless of the row structures of those columns. - * - * panel_size (input) int - * A panel consists of at most panel_size consecutive columns. - * - * etree (input) int*, dimension (A->ncol) - * Elimination tree of A'*A. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * On input, the columns of A should be permuted so that the - * etree is in a certain postorder. - * - * work (input/output) void*, size (lwork) (in bytes) - * User-supplied work space and space for the output data structures. - * Not referenced if lwork = 0; - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * *info; no other side effects. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * When searching for diagonal, perm_c[*] is applied to the - * row subscripts of A, so that diagonal threshold pivoting - * can find the diagonal of A, rather than that of A*Pc. - * - * perm_r (input/output) int*, dimension (A->nrow) - * Row permutation vector which defines the permutation matrix Pr, - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by - * a new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument; - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = SLU_NC, - * Dtype = SLU_C, Mtype = SLU_TRU. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. If lwork = -1, it is - * the estimated amount of space needed, plus A->ncol. - * - * ====================================================================== - * - * Local Working Arrays: - * ====================== - * m = number of rows in the matrix - * n = number of columns in the matrix - * - * xprune[0:n-1]: xprune[*] points to locations in subscript - * vector lsub[*]. For column i, xprune[i] denotes the point where - * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need - * to be traversed for symbolic factorization. - * - * marker[0:3*m-1]: marker[i] = j means that node i has been - * reached when working on column j. - * Storage: relative to original row subscripts - * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, - * see cpanel_dfs.c; marker2 is used for inner-factorization, - * see ccolumn_dfs.c. - * - * parent[0:m-1]: parent vector used during dfs - * Storage: relative to new row subscripts - * - * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) - * unexplored neighbor of i in lsub[*] - * - * segrep[0:nseg-1]: contains the list of supernodal representatives - * in topological order of the dfs. A supernode representative is the - * last column of a supernode. - * The maximum size of segrep[] is n. - * - * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a - * supernodal representative r, repfnz[r] is the location of the first - * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 - * indicates the supernode r has been explored. - * NOTE: There are W of them, each used for one column of a panel. - * - * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below - * the panel diagonal. These are filled in during cpanel_dfs(), and are - * used later in the inner LU factorization within the panel. - * panel_lsub[]/dense[] pair forms the SPA data structure. - * NOTE: There are W of them. - * - * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; - * NOTE: there are W of them. - * - * tempv[0:*]: real temporary used for dense numeric kernels; - * The size of this array is defined by NUM_TEMPV() in csp_defs.h. - * - */ - /* Local working arrays */ - NCPformat *Astore; - int *iperm_r = NULL; /* inverse of perm_r; used when - options->Fact == SamePattern_SameRowPerm */ - int *iperm_c; /* inverse of perm_c */ - int *iwork; - complex *cwork; - int *segrep, *repfnz, *parent, *xplore; - int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ - int *xprune; - int *marker; - complex *dense, *tempv; - int *relax_end; - complex *a; - int *asub; - int *xa_begin, *xa_end; - int *xsup, *supno; - int *xlsub, *xlusup, *xusub; - int nzlumax; - static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ - - /* Local scalars */ - fact_t fact = options->Fact; - double diag_pivot_thresh = options->DiagPivotThresh; - int pivrow; /* pivotal row number in the original matrix A */ - int nseg1; /* no of segments in U-column above panel row jcol */ - int nseg; /* no of segments in each U-column */ - register int jcol; - register int kcol; /* end column of a relaxed snode */ - register int icol; - register int i, k, jj, new_next, iinfo; - int m, n, min_mn, jsupno, fsupc, nextlu, nextu; - int w_def; /* upper bound on panel width */ - int usepr, iperm_r_allocated = 0; - int nnzL, nnzU; - int *panel_histo = stat->panel_histo; - flops_t *ops = stat->ops; - - iinfo = 0; - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN(m, n); - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - - /* Allocate storage common to the factor routines */ - *info = cLUMemInit(fact, work, lwork, m, n, Astore->nnz, - panel_size, L, U, &Glu, &iwork, &cwork); - if ( *info ) return; - - xsup = Glu.xsup; - supno = Glu.supno; - xlsub = Glu.xlsub; - xlusup = Glu.xlusup; - xusub = Glu.xusub; - - SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, - &repfnz, &panel_lsub, &xprune, &marker); - cSetRWork(m, panel_size, cwork, &dense, &tempv); - - usepr = (fact == SamePattern_SameRowPerm); - if ( usepr ) { - /* Compute the inverse of perm_r */ - iperm_r = (int *) intMalloc(m); - for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; - iperm_r_allocated = 1; - } - iperm_c = (int *) intMalloc(n); - for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; - - /* Identify relaxed snodes */ - relax_end = (int *) intMalloc(n); - if ( options->SymmetricMode == YES ) { - heap_relax_snode(n, etree, relax, marker, relax_end); - } else { - relax_snode(n, etree, relax, marker, relax_end); - } - - ifill (perm_r, m, EMPTY); - ifill (marker, m * NO_MARKER, EMPTY); - supno[0] = -1; - xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; - w_def = panel_size; - - /* - * Work on one "panel" at a time. A panel is one of the following: - * (a) a relaxed supernode at the bottom of the etree, or - * (b) panel_size contiguous columns, defined by the user - */ - for (jcol = 0; jcol < min_mn; ) { - - if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ - kcol = relax_end[jcol]; /* end of the relaxed snode */ - panel_histo[kcol-jcol+1]++; - - /* -------------------------------------- - * Factorize the relaxed supernode(jcol:kcol) - * -------------------------------------- */ - /* Determine the union of the row structure of the snode */ - if ( (*info = csnode_dfs(jcol, kcol, asub, xa_begin, xa_end, - xprune, marker, &Glu)) != 0 ) - return; - - nextu = xusub[jcol]; - nextlu = xlusup[jcol]; - jsupno = supno[jcol]; - fsupc = xsup[jsupno]; - new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); - nzlumax = Glu.nzlumax; - while ( new_next > nzlumax ) { - if ( (*info = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) - return; - } - - for (icol = jcol; icol<= kcol; icol++) { - xusub[icol+1] = nextu; - - /* Scatter into SPA dense[*] */ - for (k = xa_begin[icol]; k < xa_end[icol]; k++) - dense[asub[k]] = a[k]; - - /* Numeric update within the snode */ - csnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); - - if ( (*info = cpivotL(icol, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - -#ifdef DEBUG - cprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); -#endif - - } - - jcol = icol; - - } else { /* Work on one panel of panel_size columns */ - - /* Adjust panel_size so that a panel won't overlap with the next - * relaxed snode. - */ - panel_size = w_def; - for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) - if ( relax_end[k] != EMPTY ) { - panel_size = k - jcol; - break; - } - if ( k == min_mn ) panel_size = min_mn - jcol; - panel_histo[panel_size]++; - - /* symbolic factor on a panel of columns */ - cpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, - dense, panel_lsub, segrep, repfnz, xprune, - marker, parent, xplore, &Glu); - - /* numeric sup-panel updates in topological order */ - cpanel_bmod(m, panel_size, jcol, nseg1, dense, - tempv, segrep, repfnz, &Glu, stat); - - /* Sparse LU within the panel, and below panel diagonal */ - for ( jj = jcol; jj < jcol + panel_size; jj++) { - k = (jj - jcol) * m; /* column index for w-wide arrays */ - - nseg = nseg1; /* Begin after all the panel segments */ - - if ((*info = ccolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], - segrep, &repfnz[k], xprune, marker, - parent, xplore, &Glu)) != 0) return; - - /* Numeric updates */ - if ((*info = ccolumn_bmod(jj, (nseg - nseg1), &dense[k], - tempv, &segrep[nseg1], &repfnz[k], - jcol, &Glu, stat)) != 0) return; - - /* Copy the U-segments to ucol[*] */ - if ((*info = ccopy_to_ucol(jj, nseg, segrep, &repfnz[k], - perm_r, &dense[k], &Glu)) != 0) - return; - - if ( (*info = cpivotL(jj, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - - /* Prune columns (0:jj-1) using column jj */ - cpruneL(jj, perm_r, pivrow, nseg, segrep, - &repfnz[k], xprune, &Glu); - - /* Reset repfnz[] for this column */ - resetrep_col (nseg, segrep, &repfnz[k]); - -#ifdef DEBUG - cprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); -#endif - - } - - jcol += panel_size; /* Move to the next panel */ - - } /* else */ - - } /* for */ - - *info = iinfo; - - if ( m > n ) { - k = 0; - for (i = 0; i < m; ++i) - if ( perm_r[i] == EMPTY ) { - perm_r[i] = n + k; - ++k; - } - } - - countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); - fixupL(min_mn, perm_r, &Glu); - - cLUWorkFree(iwork, cwork, &Glu); /* Free work space and compress storage */ - - if ( fact == SamePattern_SameRowPerm ) { - /* L and U structures may have changed due to possibly different - pivoting, even though the storage is available. - There could also be memory expansions, so the array locations - may have changed, */ - ((SCformat *)L->Store)->nnz = nnzL; - ((SCformat *)L->Store)->nsuper = Glu.supno[n]; - ((SCformat *)L->Store)->nzval = Glu.lusup; - ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; - ((SCformat *)L->Store)->rowind = Glu.lsub; - ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; - ((NCformat *)U->Store)->nnz = nnzU; - ((NCformat *)U->Store)->nzval = Glu.ucol; - ((NCformat *)U->Store)->rowind = Glu.usub; - ((NCformat *)U->Store)->colptr = Glu.xusub; - } else { - cCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, - Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, - Glu.xsup, SLU_SC, SLU_C, SLU_TRLU); - cCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, - Glu.usub, Glu.xusub, SLU_NC, SLU_C, SLU_TRU); - } - - ops[FACT] += ops[TRSV] + ops[GEMV]; - - if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); - SUPERLU_FREE (iperm_c); - SUPERLU_FREE (relax_end); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,345 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - - -/* - * Function prototypes - */ -void cusolve(int, int, complex*, complex*); -void clsolve(int, int, complex*, complex*); -void cmatvec(int, int, int, complex*, complex*, complex*); - - -void -cgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, SuperMatrix *B, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * CGSTRS solves a system of linear equations A*X=B or A'*X=B - * with A sparse and B dense, using the LU factorization computed by - * CGSTRF. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * cgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * cgstrf(). Use column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (L->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (L->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ -#ifdef _CRAY - _fcd ftcs1, ftcs2, ftcs3, ftcs4; -#endif - int incx = 1, incy = 1; -#ifdef USE_VENDOR_BLAS - complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; - complex *work_col; -#endif - complex temp_comp; - DNformat *Bstore; - complex *Bmat; - SCformat *Lstore; - NCformat *Ustore; - complex *Lval, *Uval; - int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; - int i, j, k, iptr, jcol, n, ldb, nrhs; - complex *work, *rhs_work, *soln; - flops_t solve_ops; - void cprint_soln(); - - /* Test input parameters ... */ - *info = 0; - Bstore = B->Store; - ldb = Bstore->lda; - nrhs = B->ncol; - if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) - *info = -2; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) - *info = -3; - else if ( ldb < SUPERLU_MAX(0, L->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) - *info = -6; - if ( *info ) { - i = -(*info); - xerbla_("cgstrs", &i); - return; - } - - n = L->nrow; - work = complexCalloc(n * nrhs); - if ( !work ) ABORT("Malloc fails for local work[]."); - soln = complexMalloc(n); - if ( !soln ) ABORT("Malloc fails for local soln[]."); - - Bmat = Bstore->nzval; - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( trans == NOTRANS ) { - /* Permute right hand sides to form Pr*B */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - /* Forward solve PLy=Pb. */ - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - nrow = nsupr - nsupc; - - solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; - solve_ops += 8 * nrow * nsupc * nrhs; - - if ( nsupc == 1 ) { - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - luptr = L_NZ_START(fsupc); - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ - irow = L_SUB(iptr); - ++luptr; - cc_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); - c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); - } - } - } else { - luptr = L_NZ_START(fsupc); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#else - ctrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - cgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#endif - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - work_col = &work[j*n]; - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - c_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); - work_col[i].r = 0.0; - work_col[i].i = 0.0; - iptr++; - } - } -#else - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - clsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); - cmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], - &rhs_work[fsupc], &work[0] ); - - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - c_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); - work[i].r = 0.; - work[i].i = 0.; - iptr++; - } - } -#endif - } /* else ... */ - } /* for L-solve */ - -#ifdef DEBUG - printf("After L-solve: y=\n"); - cprint_soln(n, nrhs, Bmat); -#endif - - /* - * Back solve Ux=y. - */ - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; - - if ( nsupc == 1 ) { - rhs_work = &Bmat[0]; - for (j = 0; j < nrhs; j++) { - c_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); - rhs_work += ldb; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("U", strlen("U")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#else - ctrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#endif -#else - for (j = 0; j < nrhs; j++) - cusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); -#endif - } - - for (j = 0; j < nrhs; ++j) { - rhs_work = &Bmat[j*ldb]; - for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ - irow = U_SUB(i); - cc_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); - c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); - } - } - } - - } /* for U-solve */ - -#ifdef DEBUG - printf("After U-solve: x=\n"); - cprint_soln(n, nrhs, Bmat); -#endif - - /* Compute the final solution X := Pc*X. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = solve_ops; - - } else { /* Solve A'*X=B or CONJ(A)*X=B */ - /* Permute right hand sides to form Pc'*B. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = 0; - if (trans == TRANS) { - for (k = 0; k < nrhs; ++k) { - /* Multiply by inv(U'). */ - sp_ctrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by inv(L'). */ - sp_ctrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); - } - } else { /* trans == CONJ */ - for (k = 0; k < nrhs; ++k) { - /* Multiply by conj(inv(U')). */ - sp_ctrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by conj(inv(L')). */ - sp_ctrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); - } - } - /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - } - - SUPERLU_FREE(work); - SUPERLU_FREE(soln); -} - -/* - * Diagnostic print of the solution vector - */ -void -cprint_soln(int n, int nrhs, complex *soln) -{ - int i; - - for (i = 0; i < n; i++) - printf("\t%d: %.4f\n", i, soln[i]); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clacon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clacon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clacon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,213 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_Cnames.h" -#include "slu_scomplex.h" - -int -clacon_(int *n, complex *v, complex *x, float *est, int *kase) - -{ -/* - Purpose - ======= - - CLACON estimates the 1-norm of a square matrix A. - Reverse communication is used for evaluating matrix-vector products. - - - Arguments - ========= - - N (input) INT - The order of the matrix. N >= 1. - - V (workspace) COMPLEX PRECISION array, dimension (N) - On the final return, V = A*W, where EST = norm(V)/norm(W) - (W is not returned). - - X (input/output) COMPLEX PRECISION array, dimension (N) - On an intermediate return, X should be overwritten by - A * X, if KASE=1, - A' * X, if KASE=2, - where A' is the conjugate transpose of A, - and CLACON must be re-called with all the other parameters - unchanged. - - - EST (output) FLOAT PRECISION - An estimate (a lower bound) for norm(A). - - KASE (input/output) INT - On the initial call to CLACON, KASE should be 0. - On an intermediate return, KASE will be 1 or 2, indicating - whether X should be overwritten by A * X or A' * X. - On the final return from CLACON, KASE will again be 0. - - Further Details - ======= ======= - - Contributed by Nick Higham, University of Manchester. - Originally named CONEST, dated March 16, 1988. - - Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of - a real or complex matrix, with applications to condition estimation", - ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. - ===================================================================== -*/ - - /* Table of constant values */ - int c__1 = 1; - complex zero = {0.0, 0.0}; - complex one = {1.0, 0.0}; - - /* System generated locals */ - float d__1; - - /* Local variables */ - static int iter; - static int jump, jlast; - static float altsgn, estold; - static int i, j; - float temp; - float safmin; - extern double slamch_(char *); - extern int icmax1_(int *, complex *, int *); - extern double scsum1_(int *, complex *, int *); - - safmin = slamch_("Safe minimum"); - if ( *kase == 0 ) { - for (i = 0; i < *n; ++i) { - x[i].r = 1. / (float) (*n); - x[i].i = 0.; - } - *kase = 1; - jump = 1; - return 0; - } - - switch (jump) { - case 1: goto L20; - case 2: goto L40; - case 3: goto L70; - case 4: goto L110; - case 5: goto L140; - } - - /* ................ ENTRY (JUMP = 1) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ - L20: - if (*n == 1) { - v[0] = x[0]; - *est = c_abs(&v[0]); - /* ... QUIT */ - goto L150; - } - *est = scsum1_(n, x, &c__1); - - for (i = 0; i < *n; ++i) { - d__1 = c_abs(&x[i]); - if (d__1 > safmin) { - d__1 = 1 / d__1; - x[i].r *= d__1; - x[i].i *= d__1; - } else { - x[i] = one; - } - } - *kase = 2; - jump = 2; - return 0; - - /* ................ ENTRY (JUMP = 2) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ -L40: - j = icmax1_(n, &x[0], &c__1); - --j; - iter = 2; - - /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ -L50: - for (i = 0; i < *n; ++i) x[i] = zero; - x[j] = one; - *kase = 1; - jump = 3; - return 0; - - /* ................ ENTRY (JUMP = 3) - X HAS BEEN OVERWRITTEN BY A*X. */ -L70: -#ifdef _CRAY - CCOPY(n, x, &c__1, v, &c__1); -#else - ccopy_(n, x, &c__1, v, &c__1); -#endif - estold = *est; - *est = scsum1_(n, v, &c__1); - - -L90: - /* TEST FOR CYCLING. */ - if (*est <= estold) goto L120; - - for (i = 0; i < *n; ++i) { - d__1 = c_abs(&x[i]); - if (d__1 > safmin) { - d__1 = 1 / d__1; - x[i].r *= d__1; - x[i].i *= d__1; - } else { - x[i] = one; - } - } - *kase = 2; - jump = 4; - return 0; - - /* ................ ENTRY (JUMP = 4) - X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ -L110: - jlast = j; - j = icmax1_(n, &x[0], &c__1); - --j; - if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) { - ++iter; - goto L50; - } - - /* ITERATION COMPLETE. FINAL STAGE. */ -L120: - altsgn = 1.; - for (i = 1; i <= *n; ++i) { - x[i-1].r = altsgn * ((float)(i - 1) / (float)(*n - 1) + 1.); - x[i-1].i = 0.; - altsgn = -altsgn; - } - *kase = 1; - jump = 5; - return 0; - - /* ................ ENTRY (JUMP = 5) - X HAS BEEN OVERWRITTEN BY A*X. */ -L140: - temp = scsum1_(n, x, &c__1) / (float)(*n * 3) * 2.; - if (temp > *est) { -#ifdef _CRAY - CCOPY(n, &x[0], &c__1, &v[0], &c__1); -#else - ccopy_(n, &x[0], &c__1, &v[0], &c__1); -#endif - *est = temp; - } - -L150: - *kase = 0; - return 0; - -} /* clacon_ */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clangs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,110 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: clangs.c - * History: Modified from lapack routine CLANGE - */ -#include -#include "slu_cdefs.h" - -float clangs(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - CLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - CLANGE returns the value - - CLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in CLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - complex *Aval; - int i, j, irow; - float value, sum; - float *rwork; - - Astore = A->Store; - Aval = Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, c_abs( &Aval[i]) ); - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += c_abs( &Aval[i] ); - value = SUPERLU_MAX(value,sum); - } - - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += c_abs( &Aval[i] ); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* clangs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/claqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/claqgs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/claqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/claqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,138 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: claqgs.c - * History: Modified from LAPACK routine CLAQGE - */ -#include -#include "slu_cdefs.h" - -void -claqgs(SuperMatrix *A, float *r, float *c, - float rowcnd, float colcnd, float amax, char *equed) -{ -/* - Purpose - ======= - - CLAQGS equilibrates a general sparse M by N matrix A using the row and - scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = NC; Dtype = SLU_C; Mtype = GE. - - R (input) float*, dimension (A->nrow) - The row scale factors for A. - - C (input) float*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) float - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) float - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) float - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - complex *Aval; - int i, j, irow; - float large, small, cj; - extern double slamch_(char *); - float temp; - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = slamch_("Safe minimum") / slamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - cs_mult(&Aval[i], &Aval[i], cj); - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - cs_mult(&Aval[i], &Aval[i], r[irow]); - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp = cj * r[irow]; - cs_mult(&Aval[i], &Aval[i], temp); - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* claqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmemory.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,680 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_cdefs.h" - -/* Constants */ -#define NO_MEMTYPE 4 /* 0: lusup; - 1: ucol; - 2: lsub; - 3: usub */ -#define GluIntArray(n) (5 * (n) + 5) - -/* Internal prototypes */ -void *cexpand (int *, MemType,int, int, GlobalLU_t *); -int cLUWorkInit (int, int, int, int **, complex **, LU_space_t); -void copy_mem_complex (int, void *, void *); -void cStackCompress (GlobalLU_t *); -void cSetupSpace (void *, int, LU_space_t *); -void *cuser_malloc (int, int); -void cuser_free (int, int); - -/* External prototypes (in memory.c - prec-indep) */ -extern void copy_mem_int (int, void *, void *); -extern void user_bcopy (char *, char *, int); - -/* Headers for 4 types of dynamatically managed memory */ -typedef struct e_node { - int size; /* length of the memory that has been used */ - void *mem; /* pointer to the new malloc'd store */ -} ExpHeader; - -typedef struct { - int size; - int used; - int top1; /* grow upward, relative to &array[0] */ - int top2; /* grow downward */ - void *array; -} LU_stack_t; - -/* Variables local to this file */ -static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */ -static LU_stack_t stack; -static int no_expand; - -/* Macros to manipulate stack */ -#define StackFull(x) ( x + stack.used >= stack.size ) -#define NotDoubleAlign(addr) ( (long int)addr & 7 ) -#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) -#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ - (w + 1) * m * sizeof(complex) ) -#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ - - - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void cSetupSpace(void *work, int lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -void *cuser_malloc(int bytes, int which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void cuser_free(int bytes, int which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int cQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) -{ - SCformat *Lstore; - NCformat *Ustore; - register int n, iword, dword, panel_size = sp_ienv(1); - - Lstore = L->Store; - Ustore = U->Store; - n = L->ncol; - iword = sizeof(int); - dword = sizeof(complex); - - /* For LU factors */ - mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] * - dword + Lstore->rowind_colptr[n] * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Ustore->colptr[n] * (dword + iword) ); - - /* Working storage to support factorization */ - mem_usage->total_needed = mem_usage->for_lu + - (float)( (2 * panel_size + 4 + NO_MARKER) * n * iword + - (panel_size + 1) * n * dword ); - - mem_usage->expansions = --no_expand; - - return 0; -} /* cQuerySpace */ - -/* - * Allocate storage for the data structures common to all factor routines. - * For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -int -cLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, - int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, - int **iwork, complex **dwork) -{ - int info, iword, dword; - SCformat *Lstore; - NCformat *Ustore; - int *xsup, *supno; - int *lsub, *xlsub; - complex *lusup; - int *xlusup; - complex *ucol; - int *usub, *xusub; - int nzlmax, nzumax, nzlumax; - int FILL = sp_ienv(6); - - Glu->n = n; - no_expand = 0; - iword = sizeof(int); - dword = sizeof(complex); - - if ( !expanders ) - expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader)); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact != SamePattern_SameRowPerm ) { - /* Guess for L\U factors */ - nzumax = nzlumax = FILL * annz; - nzlmax = SUPERLU_MAX(1, FILL/4.) * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else { - cSetupSpace(work, lwork, &Glu->MemModel); - } - -#if ( PRNTlevel >= 1 ) - printf("cLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", - FILL, nzlmax, nzumax); - fflush(stdout); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu->MemModel == SYSTEM ) { - xsup = intMalloc(n+1); - supno = intMalloc(n+1); - xlsub = intMalloc(n+1); - xlusup = intMalloc(n+1); - xusub = intMalloc(n+1); - } else { - xsup = (int *)cuser_malloc((n+1) * iword, HEAD); - supno = (int *)cuser_malloc((n+1) * iword, HEAD); - xlsub = (int *)cuser_malloc((n+1) * iword, HEAD); - xlusup = (int *)cuser_malloc((n+1) * iword, HEAD); - xusub = (int *)cuser_malloc((n+1) * iword, HEAD); - } - - lusup = (complex *) cexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (complex *) cexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) cexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) cexpand( &nzumax, USUB, 0, 1, Glu ); - - while ( !lusup || !ucol || !lsub || !usub ) { - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE(lusup); - SUPERLU_FREE(ucol); - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - cuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD); - } - nzlumax /= 2; - nzumax /= 2; - nzlmax /= 2; - if ( nzlumax < annz ) { - printf("Not enough memory to perform factorization.\n"); - return (cmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - } -#if ( PRNTlevel >= 1) - printf("cLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", - nzlmax, nzumax); - fflush(stdout); -#endif - lusup = (complex *) cexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (complex *) cexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) cexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) cexpand( &nzumax, USUB, 0, 1, Glu ); - } - - } else { - /* fact == SamePattern_SameRowPerm */ - Lstore = L->Store; - Ustore = U->Store; - xsup = Lstore->sup_to_col; - supno = Lstore->col_to_sup; - xlsub = Lstore->rowind_colptr; - xlusup = Lstore->nzval_colptr; - xusub = Ustore->colptr; - nzlmax = Glu->nzlmax; /* max from previous factorization */ - nzumax = Glu->nzumax; - nzlumax = Glu->nzlumax; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else if ( lwork == 0 ) { - Glu->MemModel = SYSTEM; - } else { - Glu->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - lsub = expanders[LSUB].mem = Lstore->rowind; - lusup = expanders[LUSUP].mem = Lstore->nzval; - usub = expanders[USUB].mem = Ustore->rowind; - ucol = expanders[UCOL].mem = Ustore->nzval;; - expanders[LSUB].size = nzlmax; - expanders[LUSUP].size = nzlumax; - expanders[USUB].size = nzumax; - expanders[UCOL].size = nzumax; - } - - Glu->xsup = xsup; - Glu->supno = supno; - Glu->lsub = lsub; - Glu->xlsub = xlsub; - Glu->lusup = lusup; - Glu->xlusup = xlusup; - Glu->ucol = ucol; - Glu->usub = usub; - Glu->xusub = xusub; - Glu->nzlmax = nzlmax; - Glu->nzumax = nzumax; - Glu->nzlumax = nzlumax; - - info = cLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel); - if ( info ) - return ( info + cmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - - ++no_expand; - return 0; - -} /* cLUMemInit */ - -/* Allocate known working storage. Returns 0 if success, otherwise - returns the number of bytes allocated so far when failure occurred. */ -int -cLUWorkInit(int m, int n, int panel_size, int **iworkptr, - complex **dworkptr, LU_space_t MemModel) -{ - int isize, dsize, extra; - complex *old_ptr; - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - - isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); - dsize = (m * panel_size + - NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(complex); - - if ( MemModel == SYSTEM ) - *iworkptr = (int *) intCalloc(isize/sizeof(int)); - else - *iworkptr = (int *) cuser_malloc(isize, TAIL); - if ( ! *iworkptr ) { - fprintf(stderr, "cLUWorkInit: malloc fails for local iworkptr[]\n"); - return (isize + n); - } - - if ( MemModel == SYSTEM ) - *dworkptr = (complex *) SUPERLU_MALLOC(dsize); - else { - *dworkptr = (complex *) cuser_malloc(dsize, TAIL); - if ( NotDoubleAlign(*dworkptr) ) { - old_ptr = *dworkptr; - *dworkptr = (complex*) DoubleAlign(*dworkptr); - *dworkptr = (complex*) ((double*)*dworkptr - 1); - extra = (char*)old_ptr - (char*)*dworkptr; -#ifdef DEBUG - printf("cLUWorkInit: not aligned, extra %d\n", extra); -#endif - stack.top2 -= extra; - stack.used += extra; - } - } - if ( ! *dworkptr ) { - fprintf(stderr, "malloc fails for local dworkptr[]."); - return (isize + dsize + n); - } - - return 0; -} - - -/* - * Set up pointers for real working arrays. - */ -void -cSetRWork(int m, int panel_size, complex *dworkptr, - complex **dense, complex **tempv) -{ - complex zero = {0.0, 0.0}; - - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - *dense = dworkptr; - *tempv = *dense + panel_size*m; - cfill (*dense, m * panel_size, zero); - cfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); -} - -/* - * Free the working storage used by factor routines. - */ -void cLUWorkFree(int *iwork, complex *dwork, GlobalLU_t *Glu) -{ - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE (iwork); - SUPERLU_FREE (dwork); - } else { - stack.used -= (stack.size - stack.top2); - stack.top2 = stack.size; -/* cStackCompress(Glu); */ - } - - SUPERLU_FREE (expanders); - expanders = 0; -} - -/* Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -cLUMemXpand(int jcol, - int next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int *maxlen, /* modified - maximum length of a data structure */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#ifdef DEBUG - printf("cLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - if (mem_type == USUB) - new_mem = cexpand(maxlen, mem_type, next, 1, Glu); - else - new_mem = cexpand(maxlen, mem_type, next, 0, Glu); - - if ( !new_mem ) { - int nzlmax = Glu->nzlmax; - int nzumax = Glu->nzumax; - int nzlumax = Glu->nzlumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (cmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); - } - - switch ( mem_type ) { - case LUSUP: - Glu->lusup = (complex *) new_mem; - Glu->nzlumax = *maxlen; - break; - case UCOL: - Glu->ucol = (complex *) new_mem; - Glu->nzumax = *maxlen; - break; - case LSUB: - Glu->lsub = (int *) new_mem; - Glu->nzlmax = *maxlen; - break; - case USUB: - Glu->usub = (int *) new_mem; - Glu->nzumax = *maxlen; - break; - } - - return 0; - -} - - - -void -copy_mem_complex(int howmany, void *old, void *new) -{ - register int i; - complex *dold = old; - complex *dnew = new; - for (i = 0; i < howmany; i++) dnew[i] = dold[i]; -} - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -void -*cexpand ( - int *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int len_to_copy, /* size of the memory to be copied to new store */ - int keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem, *old_mem; - int new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( type == LSUB || type == USUB ) lword = sizeof(int); - else lword = sizeof(complex); - - if ( Glu->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - } - } - if ( type == LSUB || type == USUB ) { - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - } else { - copy_mem_complex(len_to_copy, expanders[type].mem, new_mem); - } - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = cuser_malloc(new_len * lword, HEAD); - if ( NotDoubleAlign(new_mem) && - (type == LUSUP || type == UCOL) ) { - old_mem = new_mem; - new_mem = (void *)DoubleAlign(new_mem); - extra = (char*)new_mem - (char*)old_mem; -#ifdef DEBUG - printf("expand(): not aligned, extra %d\n", extra); -#endif - stack.top1 += extra; - stack.used += extra; - } - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu->usub = expanders[USUB].mem = - (void*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu->lsub = expanders[LSUB].mem = - (void*)((char*)expanders[LSUB].mem + extra); - } - if ( type < UCOL ) { - Glu->ucol = expanders[UCOL].mem = - (void*)((char*)expanders[UCOL].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - if ( type == UCOL ) { - stack.top1 += extra; /* Add same amount for USUB */ - stack.used += extra; - } - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* cexpand */ - - -/* - * Compress the work[] array to remove fragmentation. - */ -void -cStackCompress(GlobalLU_t *Glu) -{ - register int iword, dword, ndim; - char *last, *fragment; - int *ifrom, *ito; - complex *dfrom, *dto; - int *xlsub, *lsub, *xusub, *usub, *xlusup; - complex *ucol, *lusup; - - iword = sizeof(int); - dword = sizeof(complex); - ndim = Glu->n; - - xlsub = Glu->xlsub; - lsub = Glu->lsub; - xusub = Glu->xusub; - usub = Glu->usub; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - lusup = Glu->lusup; - - dfrom = ucol; - dto = (complex *)((char*)lusup + xlusup[ndim] * dword); - copy_mem_complex(xusub[ndim], dfrom, dto); - ucol = dto; - - ifrom = lsub; - ito = (int *) ((char*)ucol + xusub[ndim] * iword); - copy_mem_int(xlsub[ndim], ifrom, ito); - lsub = ito; - - ifrom = usub; - ito = (int *) ((char*)lsub + xlsub[ndim] * iword); - copy_mem_int(xusub[ndim], ifrom, ito); - usub = ito; - - last = (char*)usub + xusub[ndim] * iword; - fragment = (char*) (((char*)stack.array + stack.top1) - last); - stack.used -= (long int) fragment; - stack.top1 -= (long int) fragment; - - Glu->ucol = ucol; - Glu->lsub = lsub; - Glu->usub = usub; - -#ifdef DEBUG - printf("cStackCompress: fragment %d\n", fragment); - /* for (last = 0; last < ndim; ++last) - print_lu_col("After compress:", last, 0);*/ -#endif - -} - -/* - * Allocate storage for original matrix A - */ -void -callocateA(int n, int nnz, complex **a, int **asub, int **xa) -{ - *a = (complex *) complexMalloc(nnz); - *asub = (int *) intMalloc(nnz); - *xa = (int *) intMalloc(n+1); -} - - -complex *complexMalloc(int n) -{ - complex *buf; - buf = (complex *) SUPERLU_MALLOC((size_t)n * sizeof(complex)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in complexMalloc()\n"); - } - return (buf); -} - -complex *complexCalloc(int n) -{ - complex *buf; - register int i; - complex zero = {0.0, 0.0}; - buf = (complex *) SUPERLU_MALLOC((size_t)n * sizeof(complex)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in complexCalloc()\n"); - } - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - - -int cmemory_usage(const int nzlmax, const int nzumax, - const int nzlumax, const int n) -{ - register int iword, dword; - - iword = sizeof(int); - dword = sizeof(complex); - - return (10 * n * iword + - nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmyblas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,182 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: cmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ -#include "slu_scomplex.h" - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void clsolve ( int ldm, int ncol, complex *M, complex *rhs ) -{ - int k; - complex x0, x1, x2, x3, temp; - complex *M0; - complex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - - M0 = &M[0]; - - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&x1, &rhs[firstcol+1], &temp); - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&x2, &rhs[firstcol+2], &temp); - cc_mult(&temp, &x1, Mki1); Mki1++; - c_sub(&x2, &x2, &temp); - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&x3, &rhs[firstcol+3], &temp); - cc_mult(&temp, &x1, Mki1); Mki1++; - c_sub(&x3, &x3, &temp); - cc_mult(&temp, &x2, Mki2); Mki2++; - c_sub(&x3, &x3, &temp); - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&rhs[k], &rhs[k], &temp); - cc_mult(&temp, &x1, Mki1); Mki1++; - c_sub(&rhs[k], &rhs[k], &temp); - cc_mult(&temp, &x2, Mki2); Mki2++; - c_sub(&rhs[k], &rhs[k], &temp); - cc_mult(&temp, &x3, Mki3); Mki3++; - c_sub(&rhs[k], &rhs[k], &temp); - } - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&x1, &rhs[firstcol+1], &temp); - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - cc_mult(&temp, &x0, Mki0); Mki0++; - c_sub(&rhs[k], &rhs[k], &temp); - cc_mult(&temp, &x1, Mki1); Mki1++; - c_sub(&rhs[k], &rhs[k], &temp); - } - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -cusolve ( ldm, ncol, M, rhs ) -int ldm; /* in */ -int ncol; /* in */ -complex *M; /* in */ -complex *rhs; /* modified */ -{ - complex xj, temp; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - c_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) { - cc_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */ - c_sub(&rhs[irow], &rhs[irow], &temp); - } - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void cmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) -int ldm; /* in -- leading dimension of M */ -int nrow; /* in */ -int ncol; /* in */ -complex *M; /* in */ -complex *vec; /* in */ -complex *Mxvec; /* in/out */ -{ - complex vi0, vi1, vi2, vi3; - complex *M0, temp; - complex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - int k; - - M0 = &M[0]; - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - cc_mult(&temp, &vi0, Mki0); Mki0++; - c_add(&Mxvec[k], &Mxvec[k], &temp); - cc_mult(&temp, &vi1, Mki1); Mki1++; - c_add(&Mxvec[k], &Mxvec[k], &temp); - cc_mult(&temp, &vi2, Mki2); Mki2++; - c_add(&Mxvec[k], &Mxvec[k], &temp); - cc_mult(&temp, &vi3, Mki3); Mki3++; - c_add(&Mxvec[k], &Mxvec[k], &temp); - } - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - cc_mult(&temp, &vi0, Mki0); Mki0++; - c_add(&Mxvec[k], &Mxvec[k], &temp); - } - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Cnames.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Cnames.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Cnames.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Cnames.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 1, 1997 - * - * Changes made to this file addressing issue regarding calls to - * blas/lapack functions (Dec 2003 at LLNL) - */ -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. - * They have been modified to use of the new autoconf Fortran - * name mangling support, F77_FUNC, which is wrapped in the - * hypre_F90_NAME_BLAS macro. - */ - -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void hypre_F90_NAME_BLAS(dgemm,DGEMM)(...) - * - * This is the default. - */ - -#endif -/*** -#define hypre_F90_NAME_BLAS(dasum,DASUM) dasum -#define hypre_F90_NAME_BLAS(idamax,IDAMAX) idamax -#define hypre_F90_NAME_BLAS(dcopy,DCOPY) dcopy -#define hypre_F90_NAME_BLAS(dscal,DSCAL) dscal -#define hypre_F90_NAME_BLAS(dger,DGER) dger -#define hypre_F90_NAME_BLAS(dnrm2,DNRM2) dnrm2 -#define hypre_F90_NAME_BLAS(dsymv,DSYMV) dsymv -#define hypre_F90_NAME_BLAS(ddot,DDOT) ddot -#define hypre_F90_NAME_BLAS(daxpy,DAXPY) daxpy -#define hypre_F90_NAME_BLAS(dsyr2,DSYR2) dsyr2 -#define hypre_F90_NAME_BLAS(drot,DROT) drot -#define hypre_F90_NAME_BLAS(dgemv,DGEMV) dgemv -#define hypre_F90_NAME_BLAS(dtrsv,DTRSV) dtrsv -#define hypre_F90_NAME_BLAS(dgemm,DGEMM) dgemm -#define hypre_F90_NAME_BLAS(dtrsm,DTRSM) dtrsm - -#define hypre_F90_NAME_BLAS(xerbla,XERBLA) xerbla -#define hypre_F90_NAME_BLAS(dpotrf,DPOTRF) dpotrf -#define hypre_F90_NAME_BLAS(dgels,DGELS) dgels -#define hypre_F90_NAME_BLAS(dpotrs,DPOTRS) dpotrs -#define hypre_F90_NAME_BLAS(lsame,LSAME) lsame -#define hypre_F90_NAME_BLAS(dlamch,DLAMCH) dlamch - -#define c_bridge_dgssv_ c_bridge_dgssv -***/ -#endif /* __SUPERLU_CNAMES */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,3412 +0,0 @@ -/* ========================================================================== */ -/* === colamd/symamd - a sparse matrix column ordering algorithm ============ */ -/* ========================================================================== */ - -/* - colamd: an approximate minimum degree column ordering algorithm, - for LU factorization of symmetric or unsymmetric matrices, - QR factorization, least squares, interior point methods for - linear programming problems, and other related problems. - - symamd: an approximate minimum degree ordering algorithm for Cholesky - factorization of symmetric matrices. - - Purpose: - - Colamd computes a permutation Q such that the Cholesky factorization of - (AQ)'(AQ) has less fill-in and requires fewer floating point operations - than A'A. This also provides a good ordering for sparse partial - pivoting methods, P(AQ) = LU, where Q is computed prior to numerical - factorization, and P is computed during numerical factorization via - conventional partial pivoting with row interchanges. Colamd is the - column ordering method used in SuperLU, part of the ScaLAPACK library. - It is also available as built-in function in MATLAB Version 6, - available from MathWorks, Inc. (http://www.mathworks.com). This - routine can be used in place of colmmd in MATLAB. - - Symamd computes a permutation P of a symmetric matrix A such that the - Cholesky factorization of PAP' has less fill-in and requires fewer - floating point operations than A. Symamd constructs a matrix M such - that M'M has the same nonzero pattern of A, and then orders the columns - of M using colmmd. The column ordering of M is then returned as the - row and column ordering P of A. - - Authors: - - The authors of the code itself are Stefan I. Larimore and Timothy A. - Davis (davis@cise.ufl.edu), University of Florida. The algorithm was - developed in collaboration with John Gilbert, Xerox PARC, and Esmond - Ng, Oak Ridge National Laboratory. - - Date: - - September 8, 2003. Version 2.3. - - Acknowledgements: - - This work was supported by the National Science Foundation, under - grants DMS-9504974 and DMS-9803599. - - Copyright and License: - - Copyright (c) 1998-2003 by the University of Florida. - All Rights Reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use, copy, modify, and/or distribute - this program, provided that the Copyright, this License, and the - Availability of the original version is retained on all copies and made - accessible to the end-user of any code or package that includes COLAMD - or any modified version of COLAMD. - - Availability: - - The colamd/symamd library is available at - - http://www.cise.ufl.edu/research/sparse/colamd/ - - This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c - file. It requires the colamd.h file. It is required by the colamdmex.c - and symamdmex.c files, for the MATLAB interface to colamd and symamd. - - See the ChangeLog file for changes since Version 1.0. - -*/ - -/* ========================================================================== */ -/* === Description of user-callable routines ================================ */ -/* ========================================================================== */ - -/* - ---------------------------------------------------------------------------- - colamd_recommended: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - int colamd_recommended (int nnz, int n_row, int n_col) ; - - or as a C macro - - #include "colamd.h" - Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ; - - Purpose: - - Returns recommended value of Alen for use by colamd. Returns -1 - if any input argument is negative. The use of this routine - or macro is optional. Note that the macro uses its arguments - more than once, so be careful for side effects, if you pass - expressions as arguments to COLAMD_RECOMMENDED. Not needed for - symamd, which dynamically allocates its own memory. - - Arguments (all input arguments): - - int nnz ; Number of nonzeros in the matrix A. This must - be the same value as p [n_col] in the call to - colamd - otherwise you will get a wrong value - of the recommended memory to use. - - int n_row ; Number of rows in the matrix A. - - int n_col ; Number of columns in the matrix A. - - ---------------------------------------------------------------------------- - colamd_set_defaults: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - colamd_set_defaults (double knobs [COLAMD_KNOBS]) ; - - Purpose: - - Sets the default parameters. The use of this routine is optional. - - Arguments: - - double knobs [COLAMD_KNOBS] ; Output only. - - Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) - entries are removed prior to ordering. Columns with more than - (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to - ordering, and placed last in the output column ordering. - - Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0]. - Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n) - entries are removed prior to ordering, and placed last in the - output ordering. - - COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, - respectively, in colamd.h. Default values of these two knobs - are both 0.5. Currently, only knobs [0] and knobs [1] are - used, but future versions may use more knobs. If so, they will - be properly set to their defaults by the future version of - colamd_set_defaults, so that the code that calls colamd will - not need to change, assuming that you either use - colamd_set_defaults, or pass a (double *) NULL pointer as the - knobs array to colamd or symamd. - - ---------------------------------------------------------------------------- - colamd: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - int colamd (int n_row, int n_col, int Alen, int *A, int *p, - double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ; - - Purpose: - - Computes a column ordering (Q) of A such that P(AQ)=LU or - (AQ)'AQ=LL' have less fill-in and require fewer floating point - operations than factorizing the unpermuted matrix A or A'A, - respectively. - - Returns: - - TRUE (1) if successful, FALSE (0) otherwise. - - Arguments: - - int n_row ; Input argument. - - Number of rows in the matrix A. - Restriction: n_row >= 0. - Colamd returns FALSE if n_row is negative. - - int n_col ; Input argument. - - Number of columns in the matrix A. - Restriction: n_col >= 0. - Colamd returns FALSE if n_col is negative. - - int Alen ; Input argument. - - Restriction (see note): - Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col - Colamd returns FALSE if these conditions are not met. - - Note: this restriction makes an modest assumption regarding - the size of the two typedef's structures in colamd.h. - We do, however, guarantee that - - Alen >= colamd_recommended (nnz, n_row, n_col) - - or equivalently as a C preprocessor macro: - - Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col) - - will be sufficient. - - int A [Alen] ; Input argument, undefined on output. - - A is an integer array of size Alen. Alen must be at least as - large as the bare minimum value given above, but this is very - low, and can result in excessive run time. For best - performance, we recommend that Alen be greater than or equal to - colamd_recommended (nnz, n_row, n_col), which adds - nnz/5 to the bare minimum value given above. - - On input, the row indices of the entries in column c of the - matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices - in a given column c need not be in ascending order, and - duplicate row indices may be be present. However, colamd will - work a little faster if both of these conditions are met - (Colamd puts the matrix into this format, if it finds that the - the conditions are not met). - - The matrix is 0-based. That is, rows are in the range 0 to - n_row-1, and columns are in the range 0 to n_col-1. Colamd - returns FALSE if any row index is out of range. - - The contents of A are modified during ordering, and are - undefined on output. - - int p [n_col+1] ; Both input and output argument. - - p is an integer array of size n_col+1. On input, it holds the - "pointers" for the column form of the matrix A. Column c of - the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first - entry, p [0], must be zero, and p [c] <= p [c+1] must hold - for all c in the range 0 to n_col-1. The value p [n_col] is - thus the total number of entries in the pattern of the matrix A. - Colamd returns FALSE if these conditions are not met. - - On output, if colamd returns TRUE, the array p holds the column - permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is - the first column index in the new ordering, and p [n_col-1] is - the last. That is, p [k] = j means that column j of A is the - kth pivot column, in AQ, where k is in the range 0 to n_col-1 - (p [0] = j means that column j of A is the first column in AQ). - - If colamd returns FALSE, then no permutation is returned, and - p is undefined on output. - - double knobs [COLAMD_KNOBS] ; Input argument. - - See colamd_set_defaults for a description. - - int stats [COLAMD_STATS] ; Output argument. - - Statistics on the ordering, and error status. - See colamd.h for related definitions. - Colamd returns FALSE if stats is not present. - - stats [0]: number of dense or empty rows ignored. - - stats [1]: number of dense or empty columns ignored (and - ordered last in the output permutation p) - Note that a row can become "empty" if it - contains only "dense" and/or "empty" columns, - and similarly a column can become "empty" if it - only contains "dense" and/or "empty" rows. - - stats [2]: number of garbage collections performed. - This can be excessively high if Alen is close - to the minimum required value. - - stats [3]: status code. < 0 is an error code. - > 1 is a warning or notice. - - 0 OK. Each column of the input matrix contained - row indices in increasing order, with no - duplicates. - - 1 OK, but columns of input matrix were jumbled - (unsorted columns or duplicate entries). Colamd - had to do some extra work to sort the matrix - first and remove duplicate entries, but it - still was able to return a valid permutation - (return value of colamd was TRUE). - - stats [4]: highest numbered column that - is unsorted or has duplicate - entries. - stats [5]: last seen duplicate or - unsorted row index. - stats [6]: number of duplicate or - unsorted row indices. - - -1 A is a null pointer - - -2 p is a null pointer - - -3 n_row is negative - - stats [4]: n_row - - -4 n_col is negative - - stats [4]: n_col - - -5 number of nonzeros in matrix is negative - - stats [4]: number of nonzeros, p [n_col] - - -6 p [0] is nonzero - - stats [4]: p [0] - - -7 A is too small - - stats [4]: required size - stats [5]: actual size (Alen) - - -8 a column has a negative number of entries - - stats [4]: column with < 0 entries - stats [5]: number of entries in col - - -9 a row index is out of bounds - - stats [4]: column with bad row index - stats [5]: bad row index - stats [6]: n_row, # of rows of matrx - - -10 (unused; see symamd.c) - - -999 (unused; see symamd.c) - - Future versions may return more statistics in the stats array. - - Example: - - See http://www.cise.ufl.edu/research/sparse/colamd/example.c - for a complete example. - - To order the columns of a 5-by-4 matrix with 11 nonzero entries in - the following nonzero pattern - - x 0 x 0 - x 0 x x - 0 x x 0 - 0 0 x x - x x 0 0 - - with default knobs and no output statistics, do the following: - - #include "colamd.h" - #define ALEN COLAMD_RECOMMENDED (11, 5, 4) - int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ; - int p [ ] = {0, 3, 5, 9, 11} ; - int stats [COLAMD_STATS] ; - colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ; - - The permutation is returned in the array p, and A is destroyed. - - ---------------------------------------------------------------------------- - symamd: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - int symamd (int n, int *A, int *p, int *perm, - double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS], - void (*allocate) (size_t, size_t), void (*release) (void *)) ; - - Purpose: - - The symamd routine computes an ordering P of a symmetric sparse - matrix A such that the Cholesky factorization PAP' = LL' remains - sparse. It is based on a column ordering of a matrix M constructed - so that the nonzero pattern of M'M is the same as A. The matrix A - is assumed to be symmetric; only the strictly lower triangular part - is accessed. You must pass your selected memory allocator (usually - calloc/free or mxCalloc/mxFree) to symamd, for it to allocate - memory for the temporary matrix M. - - Returns: - - TRUE (1) if successful, FALSE (0) otherwise. - - Arguments: - - int n ; Input argument. - - Number of rows and columns in the symmetrix matrix A. - Restriction: n >= 0. - Symamd returns FALSE if n is negative. - - int A [nnz] ; Input argument. - - A is an integer array of size nnz, where nnz = p [n]. - - The row indices of the entries in column c of the matrix are - held in A [(p [c]) ... (p [c+1]-1)]. The row indices in a - given column c need not be in ascending order, and duplicate - row indices may be present. However, symamd will run faster - if the columns are in sorted order with no duplicate entries. - - The matrix is 0-based. That is, rows are in the range 0 to - n-1, and columns are in the range 0 to n-1. Symamd - returns FALSE if any row index is out of range. - - The contents of A are not modified. - - int p [n+1] ; Input argument. - - p is an integer array of size n+1. On input, it holds the - "pointers" for the column form of the matrix A. Column c of - the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first - entry, p [0], must be zero, and p [c] <= p [c+1] must hold - for all c in the range 0 to n-1. The value p [n] is - thus the total number of entries in the pattern of the matrix A. - Symamd returns FALSE if these conditions are not met. - - The contents of p are not modified. - - int perm [n+1] ; Output argument. - - On output, if symamd returns TRUE, the array perm holds the - permutation P, where perm [0] is the first index in the new - ordering, and perm [n-1] is the last. That is, perm [k] = j - means that row and column j of A is the kth column in PAP', - where k is in the range 0 to n-1 (perm [0] = j means - that row and column j of A are the first row and column in - PAP'). The array is used as a workspace during the ordering, - which is why it must be of length n+1, not just n. - - double knobs [COLAMD_KNOBS] ; Input argument. - - See colamd_set_defaults for a description. - - int stats [COLAMD_STATS] ; Output argument. - - Statistics on the ordering, and error status. - See colamd.h for related definitions. - Symamd returns FALSE if stats is not present. - - stats [0]: number of dense or empty row and columns ignored - (and ordered last in the output permutation - perm). Note that a row/column can become - "empty" if it contains only "dense" and/or - "empty" columns/rows. - - stats [1]: (same as stats [0]) - - stats [2]: number of garbage collections performed. - - stats [3]: status code. < 0 is an error code. - > 1 is a warning or notice. - - 0 OK. Each column of the input matrix contained - row indices in increasing order, with no - duplicates. - - 1 OK, but columns of input matrix were jumbled - (unsorted columns or duplicate entries). Symamd - had to do some extra work to sort the matrix - first and remove duplicate entries, but it - still was able to return a valid permutation - (return value of symamd was TRUE). - - stats [4]: highest numbered column that - is unsorted or has duplicate - entries. - stats [5]: last seen duplicate or - unsorted row index. - stats [6]: number of duplicate or - unsorted row indices. - - -1 A is a null pointer - - -2 p is a null pointer - - -3 (unused, see colamd.c) - - -4 n is negative - - stats [4]: n - - -5 number of nonzeros in matrix is negative - - stats [4]: # of nonzeros (p [n]). - - -6 p [0] is nonzero - - stats [4]: p [0] - - -7 (unused) - - -8 a column has a negative number of entries - - stats [4]: column with < 0 entries - stats [5]: number of entries in col - - -9 a row index is out of bounds - - stats [4]: column with bad row index - stats [5]: bad row index - stats [6]: n_row, # of rows of matrx - - -10 out of memory (unable to allocate temporary - workspace for M or count arrays using the - "allocate" routine passed into symamd). - - -999 internal error. colamd failed to order the - matrix M, when it should have succeeded. This - indicates a bug. If this (and *only* this) - error code occurs, please contact the authors. - Don't contact the authors if you get any other - error code. - - Future versions may return more statistics in the stats array. - - void * (*allocate) (size_t, size_t) - - A pointer to a function providing memory allocation. The - allocated memory must be returned initialized to zero. For a - C application, this argument should normally be a pointer to - calloc. For a MATLAB mexFunction, the routine mxCalloc is - passed instead. - - void (*release) (size_t, size_t) - - A pointer to a function that frees memory allocated by the - memory allocation routine above. For a C application, this - argument should normally be a pointer to free. For a MATLAB - mexFunction, the routine mxFree is passed instead. - - - ---------------------------------------------------------------------------- - colamd_report: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - colamd_report (int stats [COLAMD_STATS]) ; - - Purpose: - - Prints the error status and statistics recorded in the stats - array on the standard error output (for a standard C routine) - or on the MATLAB output (for a mexFunction). - - Arguments: - - int stats [COLAMD_STATS] ; Input only. Statistics from colamd. - - - ---------------------------------------------------------------------------- - symamd_report: - ---------------------------------------------------------------------------- - - C syntax: - - #include "colamd.h" - symamd_report (int stats [COLAMD_STATS]) ; - - Purpose: - - Prints the error status and statistics recorded in the stats - array on the standard error output (for a standard C routine) - or on the MATLAB output (for a mexFunction). - - Arguments: - - int stats [COLAMD_STATS] ; Input only. Statistics from symamd. - - -*/ - -/* ========================================================================== */ -/* === Scaffolding code definitions ======================================== */ -/* ========================================================================== */ - -/* Ensure that debugging is turned off: */ -#ifndef NDEBUG -#define NDEBUG -#endif /* NDEBUG */ - -/* - Our "scaffolding code" philosophy: In our opinion, well-written library - code should keep its "debugging" code, and just normally have it turned off - by the compiler so as not to interfere with performance. This serves - several purposes: - - (1) assertions act as comments to the reader, telling you what the code - expects at that point. All assertions will always be true (unless - there really is a bug, of course). - - (2) leaving in the scaffolding code assists anyone who would like to modify - the code, or understand the algorithm (by reading the debugging output, - one can get a glimpse into what the code is doing). - - (3) (gasp!) for actually finding bugs. This code has been heavily tested - and "should" be fully functional and bug-free ... but you never know... - - To enable debugging, comment out the "#define NDEBUG" above. For a MATLAB - mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG - definition. The code will become outrageously slow when debugging is - enabled. To control the level of debugging output, set an environment - variable D to 0 (little), 1 (some), 2, 3, or 4 (lots). When debugging, - you should see the following message on the standard output: - - colamd: debug version, D = 1 (THIS WILL BE SLOW!) - - or a similar message for symamd. If you don't, then debugging has not - been enabled. - -*/ - -/* ========================================================================== */ -/* === Include files ======================================================== */ -/* ========================================================================== */ - -#include "colamd.h" -#include - -#ifdef MATLAB_MEX_FILE -#include "mex.h" -#include "matrix.h" -#else -#include -#include -#endif /* MATLAB_MEX_FILE */ - -/* ========================================================================== */ -/* === Definitions ========================================================== */ -/* ========================================================================== */ - -/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */ -#define PUBLIC -#define PRIVATE static - -#define MAX(a,b) (((a) > (b)) ? (a) : (b)) -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) - -#define ONES_COMPLEMENT(r) (-(r)-1) - -/* -------------------------------------------------------------------------- */ -/* Change for version 2.1: define TRUE and FALSE only if not yet defined */ -/* -------------------------------------------------------------------------- */ - -#ifndef TRUE -#define TRUE (1) -#endif - -#ifndef FALSE -#define FALSE (0) -#endif - -/* -------------------------------------------------------------------------- */ - -#define EMPTY (-1) - -/* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) - -/* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } - -/* ========================================================================== */ -/* === Colamd reporting mechanism =========================================== */ -/* ========================================================================== */ - -#ifdef MATLAB_MEX_FILE - -/* use mexPrintf in a MATLAB mexFunction, for debugging and statistics output */ -#define PRINTF mexPrintf - -/* In MATLAB, matrices are 1-based to the user, but 0-based internally */ -#define INDEX(i) ((i)+1) - -#else - -/* Use printf in standard C environment, for debugging and statistics output. */ -/* Output is generated only if debugging is enabled at compile time, or if */ -/* the caller explicitly calls colamd_report or symamd_report. */ -#define PRINTF printf - -/* In C, matrices are 0-based and indices are reported as such in *_report */ -#define INDEX(i) (i) - -#endif /* MATLAB_MEX_FILE */ - -/* ========================================================================== */ -/* === Prototypes of PRIVATE routines ======================================= */ -/* ========================================================================== */ - -PRIVATE int init_rows_cols -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int p [], - int stats [COLAMD_STATS] -) ; - -PRIVATE void init_scoring -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int head [], - double knobs [COLAMD_KNOBS], - int *p_n_row2, - int *p_n_col2, - int *p_max_deg -) ; - -PRIVATE int find_ordering -( - int n_row, - int n_col, - int Alen, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int head [], - int n_col2, - int max_deg, - int pfree -) ; - -PRIVATE void order_children -( - int n_col, - Colamd_Col Col [], - int p [] -) ; - -PRIVATE void detect_super_cols -( - -#ifndef NDEBUG - int n_col, - Colamd_Row Row [], -#endif /* NDEBUG */ - - Colamd_Col Col [], - int A [], - int head [], - int row_start, - int row_length -) ; - -PRIVATE int garbage_collection -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int *pfree -) ; - -PRIVATE int clear_mark -( - int n_row, - Colamd_Row Row [] -) ; - -PRIVATE void print_report -( - const char *method, - int stats [COLAMD_STATS] -) ; - -/* ========================================================================== */ -/* === Debugging prototypes and definitions ================================= */ -/* ========================================================================== */ - -#ifndef NDEBUG - -/* colamd_debug is the *ONLY* global variable, and is only */ -/* present when debugging */ - -PRIVATE int colamd_debug ; /* debug print level */ - -#define DEBUG0(params) { (void) PRINTF params ; } -#define DEBUG1(params) { if (colamd_debug >= 1) (void) PRINTF params ; } -#define DEBUG2(params) { if (colamd_debug >= 2) (void) PRINTF params ; } -#define DEBUG3(params) { if (colamd_debug >= 3) (void) PRINTF params ; } -#define DEBUG4(params) { if (colamd_debug >= 4) (void) PRINTF params ; } - -#ifdef MATLAB_MEX_FILE -#define ASSERT(expression) (mxAssert ((expression), "")) -#else -#define ASSERT(expression) (assert (expression)) -#endif /* MATLAB_MEX_FILE */ - -PRIVATE void colamd_get_debug /* gets the debug print level from getenv */ -( - char *method -) ; - -PRIVATE void debug_deg_lists -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int head [], - int min_score, - int should, - int max_deg -) ; - -PRIVATE void debug_mark -( - int n_row, - Colamd_Row Row [], - int tag_mark, - int max_mark -) ; - -PRIVATE void debug_matrix -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [] -) ; - -PRIVATE void debug_structures -( - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int n_col2 -) ; - -#else /* NDEBUG */ - -/* === No debugging ========================================================= */ - -#define DEBUG0(params) ; -#define DEBUG1(params) ; -#define DEBUG2(params) ; -#define DEBUG3(params) ; -#define DEBUG4(params) ; - -#define ASSERT(expression) ((void) 0) - -#endif /* NDEBUG */ - -/* ========================================================================== */ - - - -/* ========================================================================== */ -/* === USER-CALLABLE ROUTINES: ============================================== */ -/* ========================================================================== */ - - -/* ========================================================================== */ -/* === colamd_recommended =================================================== */ -/* ========================================================================== */ - -/* - The colamd_recommended routine returns the suggested size for Alen. This - value has been determined to provide good balance between the number of - garbage collections and the memory requirements for colamd. If any - argument is negative, a -1 is returned as an error condition. This - function is also available as a macro defined in colamd.h, so that you - can use it for a statically-allocated array size. -*/ - -PUBLIC int colamd_recommended /* returns recommended value of Alen. */ -( - /* === Parameters ======================================================= */ - - int nnz, /* number of nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) -{ - return (COLAMD_RECOMMENDED (nnz, n_row, n_col)) ; -} - - -/* ========================================================================== */ -/* === colamd_set_defaults ================================================== */ -/* ========================================================================== */ - -/* - The colamd_set_defaults routine sets the default values of the user- - controllable parameters for colamd: - - knobs [0] rows with knobs[0]*n_col entries or more are removed - prior to ordering in colamd. Rows and columns with - knobs[0]*n_col entries or more are removed prior to - ordering in symamd and placed last in the output - ordering. - - knobs [1] columns with knobs[1]*n_row entries or more are removed - prior to ordering in colamd, and placed last in the - column permutation. Symamd ignores this knob. - - knobs [2..19] unused, but future versions might use this -*/ - -PUBLIC void colamd_set_defaults -( - /* === Parameters ======================================================= */ - - double knobs [COLAMD_KNOBS] /* knob array */ -) -{ - /* === Local variables ================================================== */ - - int i ; - - if (!knobs) - { - return ; /* no knobs to initialize */ - } - for (i = 0 ; i < COLAMD_KNOBS ; i++) - { - knobs [i] = 0 ; - } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ -} - - -/* ========================================================================== */ -/* === symamd =============================================================== */ -/* ========================================================================== */ - -PUBLIC int symamd /* return TRUE if OK, FALSE otherwise */ -( - /* === Parameters ======================================================= */ - - int n, /* number of rows and columns of A */ - int A [], /* row indices of A */ - int p [], /* column pointers of A */ - int perm [], /* output permutation, size n+1 */ - double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */ - int stats [COLAMD_STATS], /* output statistics and error codes */ - void * (*allocate) (size_t, size_t), - /* pointer to calloc (ANSI C) or */ - /* mxCalloc (for MATLAB mexFunction) */ - void (*release) (void *) - /* pointer to free (ANSI C) or */ - /* mxFree (for MATLAB mexFunction) */ -) -{ - /* === Local variables ================================================== */ - - int *count ; /* length of each column of M, and col pointer*/ - int *mark ; /* mark array for finding duplicate entries */ - int *M ; /* row indices of matrix M */ - int Mlen ; /* length of M */ - int n_row ; /* number of rows in M */ - int nnz ; /* number of entries in A */ - int i ; /* row index of A */ - int j ; /* column index of A */ - int k ; /* row index of M */ - int mnz ; /* number of nonzeros in M */ - int pp ; /* index into a column of A */ - int last_row ; /* last row seen in the current column */ - int length ; /* number of nonzeros in a column */ - - double cknobs [COLAMD_KNOBS] ; /* knobs for colamd */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs for colamd */ - int cstats [COLAMD_STATS] ; /* colamd stats */ - -#ifndef NDEBUG - colamd_get_debug ("symamd") ; -#endif /* NDEBUG */ - - /* === Check the input arguments ======================================== */ - - if (!stats) - { - DEBUG0 (("symamd: stats not present\n")) ; - return (FALSE) ; - } - for (i = 0 ; i < COLAMD_STATS ; i++) - { - stats [i] = 0 ; - } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - - if (!A) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; - DEBUG0 (("symamd: A not present\n")) ; - return (FALSE) ; - } - - if (!p) /* p is not present */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; - DEBUG0 (("symamd: p not present\n")) ; - return (FALSE) ; - } - - if (n < 0) /* n must be >= 0 */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n ; - DEBUG0 (("symamd: n negative %d\n", n)) ; - return (FALSE) ; - } - - nnz = p [n] ; - if (nnz < 0) /* nnz must be >= 0 */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; - DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ; - return (FALSE) ; - } - - if (p [0] != 0) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; - DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ; - return (FALSE) ; - } - - /* === If no knobs, set default knobs =================================== */ - - if (!knobs) - { - colamd_set_defaults (default_knobs) ; - knobs = default_knobs ; - } - - /* === Allocate count and mark ========================================== */ - - count = (int *) ((*allocate) (n+1, sizeof (int))) ; - if (!count) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; - DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ; - return (FALSE) ; - } - - mark = (int *) ((*allocate) (n+1, sizeof (int))) ; - if (!mark) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; - (*release) ((void *) count) ; - DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ; - return (FALSE) ; - } - - /* === Compute column counts of M, check if A is valid ================== */ - - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ - - for (i = 0 ; i < n ; i++) - { - mark [i] = -1 ; - } - - for (j = 0 ; j < n ; j++) - { - last_row = -1 ; - - length = p [j+1] - p [j] ; - if (length < 0) - { - /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = j ; - stats [COLAMD_INFO2] = length ; - (*release) ((void *) count) ; - (*release) ((void *) mark) ; - DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ; - return (FALSE) ; - } - - for (pp = p [j] ; pp < p [j+1] ; pp++) - { - i = A [pp] ; - if (i < 0 || i >= n) - { - /* row index i, in column j, is out of bounds */ - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = j ; - stats [COLAMD_INFO2] = i ; - stats [COLAMD_INFO3] = n ; - (*release) ((void *) count) ; - (*release) ((void *) mark) ; - DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ; - return (FALSE) ; - } - - if (i <= last_row || mark [i] == j) - { - /* row index is unsorted or repeated (or both), thus col */ - /* is jumbled. This is a notice, not an error condition. */ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = j ; - stats [COLAMD_INFO2] = i ; - (stats [COLAMD_INFO3]) ++ ; - DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ; - } - - if (i > j && mark [i] != j) - { - /* row k of M will contain column indices i and j */ - count [i]++ ; - count [j]++ ; - } - - /* mark the row as having been seen in this column */ - mark [i] = j ; - - last_row = i ; - } - } - - if (stats [COLAMD_STATUS] == COLAMD_OK) - { - /* if there are no duplicate entries, then mark is no longer needed */ - (*release) ((void *) mark) ; - } - - /* === Compute column pointers of M ===================================== */ - - /* use output permutation, perm, for column pointers of M */ - perm [0] = 0 ; - for (j = 1 ; j <= n ; j++) - { - perm [j] = perm [j-1] + count [j-1] ; - } - for (j = 0 ; j < n ; j++) - { - count [j] = perm [j] ; - } - - /* === Construct M ====================================================== */ - - mnz = perm [n] ; - n_row = mnz / 2 ; - Mlen = colamd_recommended (mnz, n_row, n) ; - M = (int *) ((*allocate) (Mlen, sizeof (int))) ; - DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %d\n", - n_row, n, mnz, Mlen)) ; - - if (!M) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; - (*release) ((void *) count) ; - (*release) ((void *) mark) ; - DEBUG0 (("symamd: allocate M (size %d) failed\n", Mlen)) ; - return (FALSE) ; - } - - k = 0 ; - - if (stats [COLAMD_STATUS] == COLAMD_OK) - { - /* Matrix is OK */ - for (j = 0 ; j < n ; j++) - { - ASSERT (p [j+1] - p [j] >= 0) ; - for (pp = p [j] ; pp < p [j+1] ; pp++) - { - i = A [pp] ; - ASSERT (i >= 0 && i < n) ; - if (i > j) - { - /* row k of M contains column indices i and j */ - M [count [i]++] = k ; - M [count [j]++] = k ; - k++ ; - } - } - } - } - else - { - /* Matrix is jumbled. Do not add duplicates to M. Unsorted cols OK. */ - DEBUG0 (("symamd: Duplicates in A.\n")) ; - for (i = 0 ; i < n ; i++) - { - mark [i] = -1 ; - } - for (j = 0 ; j < n ; j++) - { - ASSERT (p [j+1] - p [j] >= 0) ; - for (pp = p [j] ; pp < p [j+1] ; pp++) - { - i = A [pp] ; - ASSERT (i >= 0 && i < n) ; - if (i > j && mark [i] != j) - { - /* row k of M contains column indices i and j */ - M [count [i]++] = k ; - M [count [j]++] = k ; - k++ ; - mark [i] = j ; - } - } - } - (*release) ((void *) mark) ; - } - - /* count and mark no longer needed */ - (*release) ((void *) count) ; - ASSERT (k == n_row) ; - - /* === Adjust the knobs for M =========================================== */ - - for (i = 0 ; i < COLAMD_KNOBS ; i++) - { - cknobs [i] = knobs [i] ; - } - - /* there are no dense rows in M */ - cknobs [COLAMD_DENSE_ROW] = 1.0 ; - - if (n_row != 0 && n < n_row) - { - /* On input, the knob is a fraction of 1..n, the number of rows of A. */ - /* Convert it to a fraction of 1..n_row, of the number of rows of M. */ - cknobs [COLAMD_DENSE_COL] = (knobs [COLAMD_DENSE_ROW] * n) / n_row ; - } - else - { - /* no dense columns in M */ - cknobs [COLAMD_DENSE_COL] = 1.0 ; - } - - DEBUG0 (("symamd: dense col knob for M: %g\n", cknobs [COLAMD_DENSE_COL])) ; - - /* === Order the columns of M =========================================== */ - - if (!colamd (n_row, n, Mlen, M, perm, cknobs, cstats)) - { - /* This "cannot" happen, unless there is a bug in the code. */ - stats [COLAMD_STATUS] = COLAMD_ERROR_internal_error ; - (*release) ((void *) M) ; - DEBUG0 (("symamd: internal error!\n")) ; - return (FALSE) ; - } - - /* Note that the output permutation is now in perm */ - - /* === get the statistics for symamd from colamd ======================== */ - - /* note that a dense column in colamd means a dense row and col in symamd */ - stats [COLAMD_DENSE_ROW] = cstats [COLAMD_DENSE_COL] ; - stats [COLAMD_DENSE_COL] = cstats [COLAMD_DENSE_COL] ; - stats [COLAMD_DEFRAG_COUNT] = cstats [COLAMD_DEFRAG_COUNT] ; - - /* === Free M =========================================================== */ - - (*release) ((void *) M) ; - DEBUG0 (("symamd: done.\n")) ; - return (TRUE) ; - -} - -/* ========================================================================== */ -/* === colamd =============================================================== */ -/* ========================================================================== */ - -/* - The colamd routine computes a column ordering Q of a sparse matrix - A such that the LU factorization P(AQ) = LU remains sparse, where P is - selected via partial pivoting. The routine can also be viewed as - providing a permutation Q such that the Cholesky factorization - (AQ)'(AQ) = LL' remains sparse. -*/ - -PUBLIC int colamd /* returns TRUE if successful, FALSE otherwise*/ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* length of A */ - int A [], /* row indices of A */ - int p [], /* pointers to columns in A */ - double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */ - int stats [COLAMD_STATS] /* output statistics and error codes */ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop index */ - int nnz ; /* nonzeros in A */ - int Row_size ; /* size of Row [], in integers */ - int Col_size ; /* size of Col [], in integers */ - int need ; /* minimum required length of A */ - Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ - Colamd_Col *Col ; /* pointer into A of Col [0..n_col] array */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int ngarbage ; /* number of garbage collections performed */ - int max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ - -#ifndef NDEBUG - colamd_get_debug ("colamd") ; -#endif /* NDEBUG */ - - /* === Check the input arguments ======================================== */ - - if (!stats) - { - DEBUG0 (("colamd: stats not present\n")) ; - return (FALSE) ; - } - for (i = 0 ; i < COLAMD_STATS ; i++) - { - stats [i] = 0 ; - } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - - if (!A) /* A is not present */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; - DEBUG0 (("colamd: A not present\n")) ; - return (FALSE) ; - } - - if (!p) /* p is not present */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; - DEBUG0 (("colamd: p not present\n")) ; - return (FALSE) ; - } - - if (n_row < 0) /* n_row must be >= 0 */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; - stats [COLAMD_INFO1] = n_row ; - DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; - return (FALSE) ; - } - - if (n_col < 0) /* n_col must be >= 0 */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n_col ; - DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; - return (FALSE) ; - } - - nnz = p [n_col] ; - if (nnz < 0) /* nnz must be >= 0 */ - { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; - DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; - return (FALSE) ; - } - - if (p [0] != 0) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; - DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; - return (FALSE) ; - } - - /* === If no knobs, set default knobs =================================== */ - - if (!knobs) - { - colamd_set_defaults (default_knobs) ; - knobs = default_knobs ; - } - - /* === Allocate the Row and Col arrays from array A ===================== */ - - Col_size = COLAMD_C (n_col) ; - Row_size = COLAMD_R (n_row) ; - need = 2*nnz + n_col + Col_size + Row_size ; - - if (need > Alen) - { - /* not enough space in array A to perform the ordering */ - stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; - stats [COLAMD_INFO1] = need ; - stats [COLAMD_INFO2] = Alen ; - DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); - return (FALSE) ; - } - - Alen -= Col_size + Row_size ; - Col = (Colamd_Col *) &A [Alen] ; - Row = (Colamd_Row *) &A [Alen + Col_size] ; - - /* === Construct the row and column data structures ===================== */ - - if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) - { - /* input matrix is invalid */ - DEBUG0 (("colamd: Matrix invalid\n")) ; - return (FALSE) ; - } - - /* === Initialize scores, kill dense rows/columns ======================= */ - - init_scoring (n_row, n_col, Row, Col, A, p, knobs, - &n_row2, &n_col2, &max_deg) ; - - /* === Order the supercolumns =========================================== */ - - ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p, - n_col2, max_deg, 2*nnz) ; - - /* === Order the non-principal columns ================================== */ - - order_children (n_col, Col, p) ; - - /* === Return statistics in stats ======================================= */ - - stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; - stats [COLAMD_DENSE_COL] = n_col - n_col2 ; - stats [COLAMD_DEFRAG_COUNT] = ngarbage ; - DEBUG0 (("colamd: done.\n")) ; - return (TRUE) ; -} - - -/* ========================================================================== */ -/* === colamd_report ======================================================== */ -/* ========================================================================== */ - -PUBLIC void colamd_report -( - int stats [COLAMD_STATS] -) -{ - print_report ("colamd", stats) ; -} - - -/* ========================================================================== */ -/* === symamd_report ======================================================== */ -/* ========================================================================== */ - -PUBLIC void symamd_report -( - int stats [COLAMD_STATS] -) -{ - print_report ("symamd", stats) ; -} - - - -/* ========================================================================== */ -/* === NON-USER-CALLABLE ROUTINES: ========================================== */ -/* ========================================================================== */ - -/* There are no user-callable routines beyond this point in the file */ - - -/* ========================================================================== */ -/* === init_rows_cols ======================================================= */ -/* ========================================================================== */ - -/* - Takes the column form of the matrix in A and creates the row form of the - matrix. Also, row and column attributes are stored in the Col and Row - structs. If the columns are un-sorted or contain duplicate row indices, - this routine will also sort and remove duplicate row indices from the - column form of the matrix. Returns FALSE if the matrix is invalid, - TRUE otherwise. Not user-callable. -*/ - -PRIVATE int init_rows_cols /* returns TRUE if OK, or FALSE otherwise */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - Colamd_Col Col [], /* of size n_col+1 */ - int A [], /* row indices of A, of size Alen */ - int p [], /* pointers to columns in A, of size n_col+1 */ - int stats [COLAMD_STATS] /* colamd statistics */ -) -{ - /* === Local variables ================================================== */ - - int col ; /* a column index */ - int row ; /* a row index */ - int *cp ; /* a column pointer */ - int *cp_end ; /* a pointer to the end of a column */ - int *rp ; /* a row pointer */ - int *rp_end ; /* a pointer to the end of a row */ - int last_row ; /* previous row */ - - /* === Initialize columns, and check column pointers ==================== */ - - for (col = 0 ; col < n_col ; col++) - { - Col [col].start = p [col] ; - Col [col].length = p [col+1] - p [col] ; - - if (Col [col].length < 0) - { - /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = Col [col].length ; - DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; - return (FALSE) ; - } - - Col [col].shared1.thickness = 1 ; - Col [col].shared2.score = 0 ; - Col [col].shared3.prev = EMPTY ; - Col [col].shared4.degree_next = EMPTY ; - } - - /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ - - /* === Scan columns, compute row degrees, and check row indices ========= */ - - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ - - for (row = 0 ; row < n_row ; row++) - { - Row [row].length = 0 ; - Row [row].shared2.mark = -1 ; - } - - for (col = 0 ; col < n_col ; col++) - { - last_row = -1 ; - - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - - while (cp < cp_end) - { - row = *cp++ ; - - /* make sure row indices within range */ - if (row < 0 || row >= n_row) - { - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - stats [COLAMD_INFO3] = n_row ; - DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; - return (FALSE) ; - } - - if (row <= last_row || Row [row].shared2.mark == col) - { - /* row index are unsorted or repeated (or both), thus col */ - /* is jumbled. This is a notice, not an error condition. */ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - (stats [COLAMD_INFO3]) ++ ; - DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); - } - - if (Row [row].shared2.mark != col) - { - Row [row].length++ ; - } - else - { - /* this is a repeated entry in the column, */ - /* it will be removed */ - Col [col].length-- ; - } - - /* mark the row as having been seen in this column */ - Row [row].shared2.mark = col ; - - last_row = row ; - } - } - - /* === Compute row pointers ============================================= */ - - /* row form of the matrix starts directly after the column */ - /* form of matrix in A */ - Row [0].start = p [n_col] ; - Row [0].shared1.p = Row [0].start ; - Row [0].shared2.mark = -1 ; - for (row = 1 ; row < n_row ; row++) - { - Row [row].start = Row [row-1].start + Row [row-1].length ; - Row [row].shared1.p = Row [row].start ; - Row [row].shared2.mark = -1 ; - } - - /* === Create row form ================================================== */ - - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) - { - /* if cols jumbled, watch for repeated row indices */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - row = *cp++ ; - if (Row [row].shared2.mark != col) - { - A [(Row [row].shared1.p)++] = col ; - Row [row].shared2.mark = col ; - } - } - } - } - else - { - /* if cols not jumbled, we don't need the mark (this is faster) */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - A [(Row [*cp++].shared1.p)++] = col ; - } - } - } - - /* === Clear the row marks and set row degrees ========================== */ - - for (row = 0 ; row < n_row ; row++) - { - Row [row].shared2.mark = 0 ; - Row [row].shared1.degree = Row [row].length ; - } - - /* === See if we need to re-create columns ============================== */ - - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) - { - DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; - -#ifndef NDEBUG - /* make sure column lengths are correct */ - for (col = 0 ; col < n_col ; col++) - { - p [col] = Col [col].length ; - } - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - p [*rp++]-- ; - } - } - for (col = 0 ; col < n_col ; col++) - { - ASSERT (p [col] == 0) ; - } - /* now p is all zero (different than when debugging is turned off) */ -#endif /* NDEBUG */ - - /* === Compute col pointers ========================================= */ - - /* col form of the matrix starts at A [0]. */ - /* Note, we may have a gap between the col form and the row */ - /* form if there were duplicate entries, if so, it will be */ - /* removed upon the first garbage collection */ - Col [0].start = 0 ; - p [0] = Col [0].start ; - for (col = 1 ; col < n_col ; col++) - { - /* note that the lengths here are for pruned columns, i.e. */ - /* no duplicate row indices will exist for these columns */ - Col [col].start = Col [col-1].start + Col [col-1].length ; - p [col] = Col [col].start ; - } - - /* === Re-create col form =========================================== */ - - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - A [(p [*rp++])++] = row ; - } - } - } - - /* === Done. Matrix is not (or no longer) jumbled ====================== */ - - return (TRUE) ; -} - - -/* ========================================================================== */ -/* === init_scoring ========================================================= */ -/* ========================================================================== */ - -/* - Kills dense or empty columns and rows, calculates an initial score for - each column, and places all columns in the degree lists. Not user-callable. -*/ - -PRIVATE void init_scoring -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - Colamd_Col Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ - int *p_n_row2, /* number of non-dense, non-empty rows */ - int *p_n_col2, /* number of non-dense, non-empty columns */ - int *p_max_deg /* maximum row degree */ -) -{ - /* === Local variables ================================================== */ - - int c ; /* a column index */ - int r, row ; /* a row index */ - int *cp ; /* a column pointer */ - int deg ; /* degree of a row or column */ - int *cp_end ; /* a pointer to the end of a column */ - int *new_cp ; /* new column pointer */ - int col_length ; /* length of pruned column */ - int score ; /* current column score */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int dense_row_count ; /* remove rows with more entries than this */ - int dense_col_count ; /* remove cols with more entries than this */ - int min_score ; /* smallest column score */ - int max_deg ; /* maximum row degree */ - int next_col ; /* Used to add to degree list.*/ - -#ifndef NDEBUG - int debug_count ; /* debug only. */ -#endif /* NDEBUG */ - - /* === Extract knobs ==================================================== */ - - dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; - dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; - DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; - max_deg = 0 ; - n_col2 = n_col ; - n_row2 = n_row ; - - /* === Kill empty columns =============================================== */ - - /* Put the empty columns at the end in their natural order, so that LU */ - /* factorization can proceed as far as possible. */ - for (c = n_col-1 ; c >= 0 ; c--) - { - deg = Col [c].length ; - if (deg == 0) - { - /* this is a empty column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense columns =============================================== */ - - /* Put the dense columns at the end, in their natural order */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip any dead columns */ - if (COL_IS_DEAD (c)) - { - continue ; - } - deg = Col [c].length ; - if (deg > dense_col_count) - { - /* this is a dense column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - /* decrement the row degrees */ - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - Row [*cp++].shared1.degree-- ; - } - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense and empty rows ======================================== */ - - for (r = 0 ; r < n_row ; r++) - { - deg = Row [r].shared1.degree ; - ASSERT (deg >= 0 && deg <= n_col) ; - if (deg > dense_row_count || deg == 0) - { - /* kill a dense or empty row */ - KILL_ROW (r) ; - --n_row2 ; - } - else - { - /* keep track of max degree of remaining rows */ - max_deg = MAX (max_deg, deg) ; - } - } - DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ; - - /* === Compute initial column scores ==================================== */ - - /* At this point the row degrees are accurate. They reflect the number */ - /* of "live" (non-dense) columns in each row. No empty rows exist. */ - /* Some "live" columns may contain only dead rows, however. These are */ - /* pruned in the code below. */ - - /* now find the initial matlab score for each column */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip dead column */ - if (COL_IS_DEAD (c)) - { - continue ; - } - score = 0 ; - cp = &A [Col [c].start] ; - new_cp = cp ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - /* skip if dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - /* compact the column */ - *new_cp++ = row ; - /* add row's external degree */ - score += Row [row].shared1.degree - 1 ; - /* guard against integer overflow */ - score = MIN (score, n_col) ; - } - /* determine pruned column length */ - col_length = (int) (new_cp - &A [Col [c].start]) ; - if (col_length == 0) - { - /* a newly-made null column (all rows in this col are "dense" */ - /* and have already been killed) */ - DEBUG2 (("Newly null killed: %d\n", c)) ; - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - else - { - /* set column length and set score */ - ASSERT (score >= 0) ; - ASSERT (score <= n_col) ; - Col [c].length = col_length ; - Col [c].shared2.score = score ; - } - } - DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n", - n_col-n_col2)) ; - - /* At this point, all empty rows and columns are dead. All live columns */ - /* are "clean" (containing no dead rows) and simplicial (no supercolumns */ - /* yet). Rows may contain dead columns, but all live rows contain at */ - /* least one live column. */ - -#ifndef NDEBUG - debug_structures (n_row, n_col, Row, Col, A, n_col2) ; -#endif /* NDEBUG */ - - /* === Initialize degree lists ========================================== */ - -#ifndef NDEBUG - debug_count = 0 ; -#endif /* NDEBUG */ - - /* clear the hash buckets */ - for (c = 0 ; c <= n_col ; c++) - { - head [c] = EMPTY ; - } - min_score = n_col ; - /* place in reverse order, so low column indices are at the front */ - /* of the lists. This is to encourage natural tie-breaking */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) - { - DEBUG4 (("place %d score %d minscore %d ncol %d\n", - c, Col [c].shared2.score, min_score, n_col)) ; - - /* === Add columns score to DList =============================== */ - - score = Col [c].shared2.score ; - - ASSERT (min_score >= 0) ; - ASSERT (min_score <= n_col) ; - ASSERT (score >= 0) ; - ASSERT (score <= n_col) ; - ASSERT (head [score] >= EMPTY) ; - - /* now add this column to dList at proper score location */ - next_col = head [score] ; - Col [c].shared3.prev = EMPTY ; - Col [c].shared4.degree_next = next_col ; - - /* if there already was a column with the same score, set its */ - /* previous pointer to this new column */ - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = c ; - } - head [score] = c ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, score) ; - -#ifndef NDEBUG - debug_count++ ; -#endif /* NDEBUG */ - - } - } - -#ifndef NDEBUG - DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n", - debug_count, n_col, n_col-debug_count)) ; - ASSERT (debug_count == n_col2) ; - debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ; -#endif /* NDEBUG */ - - /* === Return number of remaining columns, and max row degree =========== */ - - *p_n_col2 = n_col2 ; - *p_n_row2 = n_row2 ; - *p_max_deg = max_deg ; -} - - -/* ========================================================================== */ -/* === find_ordering ======================================================== */ -/* ========================================================================== */ - -/* - Order the principal columns of the supercolumn form of the matrix - (no supercolumns on input). Uses a minimum approximate column minimum - degree ordering method. Not user-callable. -*/ - -PRIVATE int find_ordering /* return the number of garbage collections */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - int Alen, /* size of A, 2*nnz + n_col or larger */ - Colamd_Row Row [], /* of size n_row+1 */ - Colamd_Col Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - int n_col2, /* Remaining columns to order */ - int max_deg, /* Maximum row degree */ - int pfree /* index of first free slot (2*nnz on entry) */ -) -{ - /* === Local variables ================================================== */ - - int k ; /* current pivot ordering step */ - int pivot_col ; /* current pivot column */ - int *cp ; /* a column pointer */ - int *rp ; /* a row pointer */ - int pivot_row ; /* current pivot row */ - int *new_cp ; /* modified column pointer */ - int *new_rp ; /* modified row pointer */ - int pivot_row_start ; /* pointer to start of pivot row */ - int pivot_row_degree ; /* number of columns in pivot row */ - int pivot_row_length ; /* number of supercolumns in pivot row */ - int pivot_col_score ; /* score of pivot column */ - int needed_memory ; /* free space needed for pivot row */ - int *cp_end ; /* pointer to the end of a column */ - int *rp_end ; /* pointer to the end of a row */ - int row ; /* a row index */ - int col ; /* a column index */ - int max_score ; /* maximum possible score */ - int cur_score ; /* score of current column */ - unsigned int hash ; /* hash value for supernode detection */ - int head_column ; /* head of hash bucket */ - int first_col ; /* first column in hash bucket */ - int tag_mark ; /* marker value for mark array */ - int row_mark ; /* Row [row].shared2.mark */ - int set_difference ; /* set difference size of row with pivot row */ - int min_score ; /* smallest column score */ - int col_thickness ; /* "thickness" (no. of columns in a supercol) */ - int max_mark ; /* maximum value of tag_mark */ - int pivot_col_thickness ; /* number of columns represented by pivot col */ - int prev_col ; /* Used by Dlist operations. */ - int next_col ; /* Used by Dlist operations. */ - int ngarbage ; /* number of garbage collections performed */ - -#ifndef NDEBUG - int debug_d ; /* debug loop counter */ - int debug_step = 0 ; /* debug loop counter */ -#endif /* NDEBUG */ - - /* === Initialization and clear mark ==================================== */ - - max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = clear_mark (n_row, Row) ; - min_score = 0 ; - ngarbage = 0 ; - DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; - - /* === Order the columns ================================================ */ - - for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */) - { - -#ifndef NDEBUG - if (debug_step % 100 == 0) - { - DEBUG2 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - else - { - DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - debug_step++ ; - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif /* NDEBUG */ - - /* === Select pivot column, and order it ============================ */ - - /* make sure degree list isn't empty */ - ASSERT (min_score >= 0) ; - ASSERT (min_score <= n_col) ; - ASSERT (head [min_score] >= EMPTY) ; - -#ifndef NDEBUG - for (debug_d = 0 ; debug_d < min_score ; debug_d++) - { - ASSERT (head [debug_d] == EMPTY) ; - } -#endif /* NDEBUG */ - - /* get pivot column from head of minimum degree list */ - while (head [min_score] == EMPTY && min_score < n_col) - { - min_score++ ; - } - pivot_col = head [min_score] ; - ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; - next_col = Col [pivot_col].shared4.degree_next ; - head [min_score] = next_col ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = EMPTY ; - } - - ASSERT (COL_IS_ALIVE (pivot_col)) ; - DEBUG3 (("Pivot col: %d\n", pivot_col)) ; - - /* remember score for defrag check */ - pivot_col_score = Col [pivot_col].shared2.score ; - - /* the pivot column is the kth column in the pivot order */ - Col [pivot_col].shared2.order = k ; - - /* increment order count by column thickness */ - pivot_col_thickness = Col [pivot_col].shared1.thickness ; - k += pivot_col_thickness ; - ASSERT (pivot_col_thickness > 0) ; - - /* === Garbage_collection, if necessary ============================= */ - - needed_memory = MIN (pivot_col_score, n_col - k) ; - if (pfree + needed_memory >= Alen) - { - pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; - ngarbage++ ; - /* after garbage collection we will have enough */ - ASSERT (pfree + needed_memory < Alen) ; - /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = clear_mark (n_row, Row) ; - -#ifndef NDEBUG - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif /* NDEBUG */ - } - - /* === Compute pivot row pattern ==================================== */ - - /* get starting location for this new merged row */ - pivot_row_start = pfree ; - - /* initialize new row counts to zero */ - pivot_row_degree = 0 ; - - /* tag pivot column as having been visited so it isn't included */ - /* in merged pivot row */ - Col [pivot_col].shared1.thickness = -pivot_col_thickness ; - - /* pivot row is the union of all rows in the pivot column pattern */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; - /* skip if row is dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - /* add the column, if alive and untagged */ - col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) - { - /* tag column in pivot row */ - Col [col].shared1.thickness = -col_thickness ; - ASSERT (pfree < Alen) ; - /* place column in pivot row */ - A [pfree++] = col ; - pivot_row_degree += col_thickness ; - } - } - } - - /* clear tag on pivot column */ - Col [pivot_col].shared1.thickness = pivot_col_thickness ; - max_deg = MAX (max_deg, pivot_row_degree) ; - -#ifndef NDEBUG - DEBUG3 (("check2\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif /* NDEBUG */ - - /* === Kill all rows used to construct pivot row ==================== */ - - /* also kill pivot row, temporarily */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* may be killing an already dead row */ - row = *cp++ ; - DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; - } - - /* === Select a row index to use as the new pivot row =============== */ - - pivot_row_length = pfree - pivot_row_start ; - if (pivot_row_length > 0) - { - /* pick the "pivot" row arbitrarily (first row in col) */ - pivot_row = A [Col [pivot_col].start] ; - DEBUG3 (("Pivotal row is %d\n", pivot_row)) ; - } - else - { - /* there is no pivot row, since it is of zero length */ - pivot_row = EMPTY ; - ASSERT (pivot_row_length == 0) ; - } - ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; - - /* === Approximate degree computation =============================== */ - - /* Here begins the computation of the approximate degree. The column */ - /* score is the sum of the pivot row "length", plus the size of the */ - /* set differences of each row in the column minus the pattern of the */ - /* pivot row itself. The column ("thickness") itself is also */ - /* excluded from the column score (we thus use an approximate */ - /* external degree). */ - - /* The time taken by the following code (compute set differences, and */ - /* add them up) is proportional to the size of the data structure */ - /* being scanned - that is, the sum of the sizes of each column in */ - /* the pivot row. Thus, the amortized time to compute a column score */ - /* is proportional to the size of that column (where size, in this */ - /* context, is the column "length", or the number of row indices */ - /* in that column). The number of row indices in a column is */ - /* monotonically non-decreasing, from the length of the original */ - /* column on input to colamd. */ - - /* === Compute set differences ====================================== */ - - DEBUG3 (("** Computing set differences phase. **\n")) ; - - /* pivot row is currently dead - it will be revived later. */ - - DEBUG3 (("Pivot row: ")) ; - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; - DEBUG3 (("Col: %d\n", col)) ; - - /* clear tags used to construct pivot row pattern */ - col_thickness = -Col [col].shared1.thickness ; - ASSERT (col_thickness > 0) ; - Col [col].shared1.thickness = col_thickness ; - - /* === Remove column from degree list =========================== */ - - cur_score = Col [col].shared2.score ; - prev_col = Col [col].shared3.prev ; - next_col = Col [col].shared4.degree_next ; - ASSERT (cur_score >= 0) ; - ASSERT (cur_score <= n_col) ; - ASSERT (cur_score >= EMPTY) ; - if (prev_col == EMPTY) - { - head [cur_score] = next_col ; - } - else - { - Col [prev_col].shared4.degree_next = next_col ; - } - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = prev_col ; - } - - /* === Scan the column ========================================== */ - - cp = &A [Col [col].start] ; - cp_end = cp + Col [col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - ASSERT (row != pivot_row) ; - set_difference = row_mark - tag_mark ; - /* check if the row has been seen yet */ - if (set_difference < 0) - { - ASSERT (Row [row].shared1.degree <= max_deg) ; - set_difference = Row [row].shared1.degree ; - } - /* subtract column thickness from this row's set difference */ - set_difference -= col_thickness ; - ASSERT (set_difference >= 0) ; - /* absorb this row if the set difference becomes zero */ - if (set_difference == 0) - { - DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - KILL_ROW (row) ; - } - else - { - /* save the new mark */ - Row [row].shared2.mark = set_difference + tag_mark ; - } - } - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k-pivot_row_degree, max_deg) ; -#endif /* NDEBUG */ - - /* === Add up set differences for each column ======================= */ - - DEBUG3 (("** Adding set differences phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; - hash = 0 ; - cur_score = 0 ; - cp = &A [Col [col].start] ; - /* compact the column */ - new_cp = cp ; - cp_end = cp + Col [col].length ; - - DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ; - - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - ASSERT(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - ASSERT (row_mark > tag_mark) ; - /* compact the column */ - *new_cp++ = row ; - /* compute hash function */ - hash += row ; - /* add set difference */ - cur_score += row_mark - tag_mark ; - /* integer overflow... */ - cur_score = MIN (cur_score, n_col) ; - } - - /* recompute the column's length */ - Col [col].length = (int) (new_cp - &A [Col [col].start]) ; - - /* === Further mass elimination ================================= */ - - if (Col [col].length == 0) - { - DEBUG4 (("further mass elimination. Col: %d\n", col)) ; - /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; - pivot_row_degree -= Col [col].shared1.thickness ; - ASSERT (pivot_row_degree >= 0) ; - /* order it */ - Col [col].shared2.order = k ; - /* increment order count by column thickness */ - k += Col [col].shared1.thickness ; - } - else - { - /* === Prepare for supercolumn detection ==================== */ - - DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ; - - /* save score so far */ - Col [col].shared2.score = cur_score ; - - /* add column to hash table, for supercolumn detection */ - hash %= n_col + 1 ; - - DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ; - ASSERT (hash <= n_col) ; - - head_column = head [hash] ; - if (head_column > EMPTY) - { - /* degree list "hash" is non-empty, use prev (shared3) of */ - /* first column in degree list as head of hash bucket */ - first_col = Col [head_column].shared3.headhash ; - Col [head_column].shared3.headhash = col ; - } - else - { - /* degree list "hash" is empty, use head as hash bucket */ - first_col = - (head_column + 2) ; - head [hash] = - (col + 2) ; - } - Col [col].shared4.hash_next = first_col ; - - /* save hash function in Col [col].shared3.hash */ - Col [col].shared3.hash = (int) hash ; - ASSERT (COL_IS_ALIVE (col)) ; - } - } - - /* The approximate external column degree is now computed. */ - - /* === Supercolumn detection ======================================== */ - - DEBUG3 (("** Supercolumn detection phase. **\n")) ; - - detect_super_cols ( - -#ifndef NDEBUG - n_col, Row, -#endif /* NDEBUG */ - - Col, A, head, pivot_row_start, pivot_row_length) ; - - /* === Kill the pivotal column ====================================== */ - - KILL_PRINCIPAL_COL (pivot_col) ; - - /* === Clear mark =================================================== */ - - tag_mark += (max_deg + 1) ; - if (tag_mark >= max_mark) - { - DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = clear_mark (n_row, Row) ; - } - -#ifndef NDEBUG - DEBUG3 (("check3\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif /* NDEBUG */ - - /* === Finalize the new pivot row, and column scores ================ */ - - DEBUG3 (("** Finalize scores phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - /* compact the pivot row */ - new_rp = rp ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - /* skip dead columns */ - if (COL_IS_DEAD (col)) - { - continue ; - } - *new_rp++ = col ; - /* add new pivot row to column */ - A [Col [col].start + (Col [col].length++)] = pivot_row ; - - /* retrieve score so far and add on pivot row's degree. */ - /* (we wait until here for this in case the pivot */ - /* row's degree was reduced due to mass elimination). */ - cur_score = Col [col].shared2.score + pivot_row_degree ; - - /* calculate the max possible score as the number of */ - /* external columns minus the 'k' value minus the */ - /* columns thickness */ - max_score = n_col - k - Col [col].shared1.thickness ; - - /* make the score the external degree of the union-of-rows */ - cur_score -= Col [col].shared1.thickness ; - - /* make sure score is less or equal than the max score */ - cur_score = MIN (cur_score, max_score) ; - ASSERT (cur_score >= 0) ; - - /* store updated score */ - Col [col].shared2.score = cur_score ; - - /* === Place column back in degree list ========================= */ - - ASSERT (min_score >= 0) ; - ASSERT (min_score <= n_col) ; - ASSERT (cur_score >= 0) ; - ASSERT (cur_score <= n_col) ; - ASSERT (head [cur_score] >= EMPTY) ; - next_col = head [cur_score] ; - Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = EMPTY ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = col ; - } - head [cur_score] = col ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, cur_score) ; - - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; -#endif /* NDEBUG */ - - /* === Resurrect the new pivot row ================================== */ - - if (pivot_row_degree > 0) - { - /* update pivot row length to reflect any cols that were killed */ - /* during super-col detection and mass elimination */ - Row [pivot_row].start = pivot_row_start ; - Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ; - Row [pivot_row].shared1.degree = pivot_row_degree ; - Row [pivot_row].shared2.mark = 0 ; - /* pivot row is no longer dead */ - } - } - - /* === All principal columns have now been ordered ====================== */ - - return (ngarbage) ; -} - - -/* ========================================================================== */ -/* === order_children ======================================================= */ -/* ========================================================================== */ - -/* - The find_ordering routine has ordered all of the principal columns (the - representatives of the supercolumns). The non-principal columns have not - yet been ordered. This routine orders those columns by walking up the - parent tree (a column is a child of the column which absorbed it). The - final permutation vector is then placed in p [0 ... n_col-1], with p [0] - being the first column, and p [n_col-1] being the last. It doesn't look - like it at first glance, but be assured that this routine takes time linear - in the number of columns. Although not immediately obvious, the time - taken by this routine is O (n_col), that is, linear in the number of - columns. Not user-callable. -*/ - -PRIVATE void order_children -( - /* === Parameters ======================================================= */ - - int n_col, /* number of columns of A */ - Colamd_Col Col [], /* of size n_col+1 */ - int p [] /* p [0 ... n_col-1] is the column permutation*/ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop counter for all columns */ - int c ; /* column index */ - int parent ; /* index of column's parent */ - int order ; /* column's order */ - - /* === Order each non-principal column ================================== */ - - for (i = 0 ; i < n_col ; i++) - { - /* find an un-ordered non-principal column */ - ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY) - { - parent = i ; - /* once found, find its principal parent */ - do - { - parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; - - /* now, order all un-ordered non-principal columns along path */ - /* to this parent. collapse tree at the same time */ - c = i ; - /* get order of parent */ - order = Col [parent].shared2.order ; - - do - { - ASSERT (Col [c].shared2.order == EMPTY) ; - - /* order this column */ - Col [c].shared2.order = order++ ; - /* collaps tree */ - Col [c].shared1.parent = parent ; - - /* get immediate parent of this column */ - c = Col [c].shared1.parent ; - - /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ - /* above an ordered column */ - } while (Col [c].shared2.order == EMPTY) ; - - /* re-order the super_col parent to largest order for this group */ - Col [parent].shared2.order = order ; - } - } - - /* === Generate the permutation ========================================= */ - - for (c = 0 ; c < n_col ; c++) - { - p [Col [c].shared2.order] = c ; - } -} - - -/* ========================================================================== */ -/* === detect_super_cols ==================================================== */ -/* ========================================================================== */ - -/* - Detects supercolumns by finding matches between columns in the hash buckets. - Check amongst columns in the set A [row_start ... row_start + row_length-1]. - The columns under consideration are currently *not* in the degree lists, - and have already been placed in the hash buckets. - - The hash bucket for columns whose hash function is equal to h is stored - as follows: - - if head [h] is >= 0, then head [h] contains a degree list, so: - - head [h] is the first column in degree bucket h. - Col [head [h]].headhash gives the first column in hash bucket h. - - otherwise, the degree list is empty, and: - - -(head [h] + 2) is the first column in hash bucket h. - - For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous - column" pointer. Col [c].shared3.hash is used instead as the hash number - for that column. The value of Col [c].shared4.hash_next is the next column - in the same hash bucket. - - Assuming no, or "few" hash collisions, the time taken by this routine is - linear in the sum of the sizes (lengths) of each column whose score has - just been computed in the approximate degree computation. - Not user-callable. -*/ - -PRIVATE void detect_super_cols -( - /* === Parameters ======================================================= */ - -#ifndef NDEBUG - /* these two parameters are only needed when debugging is enabled: */ - int n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ -#endif /* NDEBUG */ - - Colamd_Col Col [], /* of size n_col+1 */ - int A [], /* row indices of A */ - int head [], /* head of degree lists and hash buckets */ - int row_start, /* pointer to set of columns to check */ - int row_length /* number of columns to check */ -) -{ - /* === Local variables ================================================== */ - - int hash ; /* hash value for a column */ - int *rp ; /* pointer to a row */ - int c ; /* a column index */ - int super_c ; /* column index of the column to absorb into */ - int *cp1 ; /* column pointer for column super_c */ - int *cp2 ; /* column pointer for column c */ - int length ; /* length of column super_c */ - int prev_c ; /* column preceding c in hash bucket */ - int i ; /* loop counter */ - int *rp_end ; /* pointer to the end of the row */ - int col ; /* a column index in the row to check */ - int head_column ; /* first column in hash bucket or degree list */ - int first_col ; /* first column in hash bucket */ - - /* === Consider each column in the row ================================== */ - - rp = &A [row_start] ; - rp_end = rp + row_length ; - while (rp < rp_end) - { - col = *rp++ ; - if (COL_IS_DEAD (col)) - { - continue ; - } - - /* get hash number for this column */ - hash = Col [col].shared3.hash ; - ASSERT (hash <= n_col) ; - - /* === Get the first column in this hash bucket ===================== */ - - head_column = head [hash] ; - if (head_column > EMPTY) - { - first_col = Col [head_column].shared3.headhash ; - } - else - { - first_col = - (head_column + 2) ; - } - - /* === Consider each column in the hash bucket ====================== */ - - for (super_c = first_col ; super_c != EMPTY ; - super_c = Col [super_c].shared4.hash_next) - { - ASSERT (COL_IS_ALIVE (super_c)) ; - ASSERT (Col [super_c].shared3.hash == hash) ; - length = Col [super_c].length ; - - /* prev_c is the column preceding column c in the hash bucket */ - prev_c = super_c ; - - /* === Compare super_c with all columns after it ================ */ - - for (c = Col [super_c].shared4.hash_next ; - c != EMPTY ; c = Col [c].shared4.hash_next) - { - ASSERT (c != super_c) ; - ASSERT (COL_IS_ALIVE (c)) ; - ASSERT (Col [c].shared3.hash == hash) ; - - /* not identical if lengths or scores are different */ - if (Col [c].length != length || - Col [c].shared2.score != Col [super_c].shared2.score) - { - prev_c = c ; - continue ; - } - - /* compare the two columns */ - cp1 = &A [Col [super_c].start] ; - cp2 = &A [Col [c].start] ; - - for (i = 0 ; i < length ; i++) - { - /* the columns are "clean" (no dead rows) */ - ASSERT (ROW_IS_ALIVE (*cp1)) ; - ASSERT (ROW_IS_ALIVE (*cp2)) ; - /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ - if (*cp1++ != *cp2++) - { - break ; - } - } - - /* the two columns are different if the for-loop "broke" */ - if (i != length) - { - prev_c = c ; - continue ; - } - - /* === Got it! two columns are identical =================== */ - - ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ; - - Col [super_c].shared1.thickness += Col [c].shared1.thickness ; - Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; - /* order c later, in order_children() */ - Col [c].shared2.order = EMPTY ; - /* remove c from hash bucket */ - Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; - } - } - - /* === Empty this hash bucket ======================================= */ - - if (head_column > EMPTY) - { - /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = EMPTY ; - } - else - { - /* corresponding degree list "hash" is empty */ - head [hash] = EMPTY ; - } - } -} - - -/* ========================================================================== */ -/* === garbage_collection =================================================== */ -/* ========================================================================== */ - -/* - Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns - the index of the first free position in A, after garbage collection. The - time taken by this routine is linear is the size of the array A, which is - itself linear in the number of nonzeros in the input matrix. - Not user-callable. -*/ - -PRIVATE int garbage_collection /* returns the new value of pfree */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows */ - int n_col, /* number of columns */ - Colamd_Row Row [], /* row info */ - Colamd_Col Col [], /* column info */ - int A [], /* A [0 ... Alen-1] holds the matrix */ - int *pfree /* &A [0] ... pfree is in use */ -) -{ - /* === Local variables ================================================== */ - - int *psrc ; /* source pointer */ - int *pdest ; /* destination pointer */ - int j ; /* counter */ - int r ; /* a row index */ - int c ; /* a column index */ - int length ; /* length of a row or column */ - -#ifndef NDEBUG - int debug_rows ; - DEBUG2 (("Defrag..\n")) ; - for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ; - debug_rows = 0 ; -#endif /* NDEBUG */ - - /* === Defragment the columns =========================================== */ - - pdest = &A[0] ; - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - psrc = &A [Col [c].start] ; - - /* move and compact the column */ - ASSERT (pdest <= psrc) ; - Col [c].start = (int) (pdest - &A [0]) ; - length = Col [c].length ; - for (j = 0 ; j < length ; j++) - { - r = *psrc++ ; - if (ROW_IS_ALIVE (r)) - { - *pdest++ = r ; - } - } - Col [c].length = (int) (pdest - &A [Col [c].start]) ; - } - } - - /* === Prepare to defragment the rows =================================== */ - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - if (Row [r].length == 0) - { - /* this row is of zero length. cannot compact it, so kill it */ - DEBUG3 (("Defrag row kill\n")) ; - KILL_ROW (r) ; - } - else - { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - ASSERT (ROW_IS_ALIVE (r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; - -#ifndef NDEBUG - debug_rows++ ; -#endif /* NDEBUG */ - - } - } - } - - /* === Defragment the rows ============================================== */ - - psrc = pdest ; - while (psrc < pfree) - { - /* find a negative number ... the start of a row */ - if (*psrc++ < 0) - { - psrc-- ; - /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; - ASSERT (r >= 0 && r < n_row) ; - /* restore first column index */ - *psrc = Row [r].shared2.first_column ; - ASSERT (ROW_IS_ALIVE (r)) ; - - /* move and compact the row */ - ASSERT (pdest <= psrc) ; - Row [r].start = (int) (pdest - &A [0]) ; - length = Row [r].length ; - for (j = 0 ; j < length ; j++) - { - c = *psrc++ ; - if (COL_IS_ALIVE (c)) - { - *pdest++ = c ; - } - } - Row [r].length = (int) (pdest - &A [Row [r].start]) ; - -#ifndef NDEBUG - debug_rows-- ; -#endif /* NDEBUG */ - - } - } - /* ensure we found all the rows */ - ASSERT (debug_rows == 0) ; - - /* === Return the new value of pfree ==================================== */ - - return ((int) (pdest - &A [0])) ; -} - - -/* ========================================================================== */ -/* === clear_mark =========================================================== */ -/* ========================================================================== */ - -/* - Clears the Row [].shared2.mark array, and returns the new tag_mark. - Return value is the new tag_mark. Not user-callable. -*/ - -PRIVATE int clear_mark /* return the new value for tag_mark */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ -) -{ - /* === Local variables ================================================== */ - - int r ; - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - Row [r].shared2.mark = 0 ; - } - } - return (1) ; -} - - -/* ========================================================================== */ -/* === print_report ========================================================= */ -/* ========================================================================== */ - -PRIVATE void print_report -( - const char *method, - int stats [COLAMD_STATS] -) -{ - - int i1, i2, i3 ; - - if (!stats) - { - PRINTF ("%s: No statistics available.\n", method) ; - return ; - } - - i1 = stats [COLAMD_INFO1] ; - i2 = stats [COLAMD_INFO2] ; - i3 = stats [COLAMD_INFO3] ; - - if (stats [COLAMD_STATUS] >= 0) - { - PRINTF ("%s: OK. ", method) ; - } - else - { - PRINTF ("%s: ERROR. ", method) ; - } - - switch (stats [COLAMD_STATUS]) - { - - case COLAMD_OK_BUT_JUMBLED: - - PRINTF ("Matrix has unsorted or duplicate row indices.\n") ; - - PRINTF ("%s: number of duplicate or out-of-order row indices: %d\n", - method, i3) ; - - PRINTF ("%s: last seen duplicate or out-of-order row index: %d\n", - method, INDEX (i2)) ; - - PRINTF ("%s: last seen in column: %d", - method, INDEX (i1)) ; - - /* no break - fall through to next case instead */ - - case COLAMD_OK: - - PRINTF ("\n") ; - - PRINTF ("%s: number of dense or empty rows ignored: %d\n", - method, stats [COLAMD_DENSE_ROW]) ; - - PRINTF ("%s: number of dense or empty columns ignored: %d\n", - method, stats [COLAMD_DENSE_COL]) ; - - PRINTF ("%s: number of garbage collections performed: %d\n", - method, stats [COLAMD_DEFRAG_COUNT]) ; - break ; - - case COLAMD_ERROR_A_not_present: - - PRINTF ("Array A (row indices of matrix) not present.\n") ; - break ; - - case COLAMD_ERROR_p_not_present: - - PRINTF ("Array p (column pointers for matrix) not present.\n") ; - break ; - - case COLAMD_ERROR_nrow_negative: - - PRINTF ("Invalid number of rows (%d).\n", i1) ; - break ; - - case COLAMD_ERROR_ncol_negative: - - PRINTF ("Invalid number of columns (%d).\n", i1) ; - break ; - - case COLAMD_ERROR_nnz_negative: - - PRINTF ("Invalid number of nonzero entries (%d).\n", i1) ; - break ; - - case COLAMD_ERROR_p0_nonzero: - - PRINTF ("Invalid column pointer, p [0] = %d, must be zero.\n", i1) ; - break ; - - case COLAMD_ERROR_A_too_small: - - PRINTF ("Array A too small.\n") ; - PRINTF (" Need Alen >= %d, but given only Alen = %d.\n", - i1, i2) ; - break ; - - case COLAMD_ERROR_col_length_negative: - - PRINTF - ("Column %d has a negative number of nonzero entries (%d).\n", - INDEX (i1), i2) ; - break ; - - case COLAMD_ERROR_row_index_out_of_bounds: - - PRINTF - ("Row index (row %d) out of bounds (%d to %d) in column %d.\n", - INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1)) ; - break ; - - case COLAMD_ERROR_out_of_memory: - - PRINTF ("Out of memory.\n") ; - break ; - - case COLAMD_ERROR_internal_error: - - /* if this happens, there is a bug in the code */ - PRINTF - ("Internal error! Please contact authors (davis@cise.ufl.edu).\n") ; - break ; - } -} - - - - -/* ========================================================================== */ -/* === colamd debugging routines ============================================ */ -/* ========================================================================== */ - -/* When debugging is disabled, the remainder of this file is ignored. */ - -#ifndef NDEBUG - - -/* ========================================================================== */ -/* === debug_structures ===================================================== */ -/* ========================================================================== */ - -/* - At this point, all empty rows and columns are dead. All live columns - are "clean" (containing no dead rows) and simplicial (no supercolumns - yet). Rows may contain dead columns, but all live rows contain at - least one live column. -*/ - -PRIVATE void debug_structures -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [], - int n_col2 -) -{ - /* === Local variables ================================================== */ - - int i ; - int c ; - int *cp ; - int *cp_end ; - int len ; - int score ; - int r ; - int *rp ; - int *rp_end ; - int deg ; - - /* === Check A, Row, and Col ============================================ */ - - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - len = Col [c].length ; - score = Col [c].shared2.score ; - DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ; - ASSERT (len > 0) ; - ASSERT (score >= 0) ; - ASSERT (Col [c].shared1.thickness == 1) ; - cp = &A [Col [c].start] ; - cp_end = cp + len ; - while (cp < cp_end) - { - r = *cp++ ; - ASSERT (ROW_IS_ALIVE (r)) ; - } - } - else - { - i = Col [c].shared2.order ; - ASSERT (i >= n_col2 && i < n_col) ; - } - } - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - i = 0 ; - len = Row [r].length ; - deg = Row [r].shared1.degree ; - ASSERT (len > 0) ; - ASSERT (deg > 0) ; - rp = &A [Row [r].start] ; - rp_end = rp + len ; - while (rp < rp_end) - { - c = *rp++ ; - if (COL_IS_ALIVE (c)) - { - i++ ; - } - } - ASSERT (i > 0) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_deg_lists ====================================================== */ -/* ========================================================================== */ - -/* - Prints the contents of the degree lists. Counts the number of columns - in the degree list and compares it to the total it should have. Also - checks the row degrees. -*/ - -PRIVATE void debug_deg_lists -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int head [], - int min_score, - int should, - int max_deg -) -{ - /* === Local variables ================================================== */ - - int deg ; - int col ; - int have ; - int row ; - - /* === Check the degree lists =========================================== */ - - if (n_col > 10000 && colamd_debug <= 0) - { - return ; - } - have = 0 ; - DEBUG4 (("Degree lists: %d\n", min_score)) ; - for (deg = 0 ; deg <= n_col ; deg++) - { - col = head [deg] ; - if (col == EMPTY) - { - continue ; - } - DEBUG4 (("%d:", deg)) ; - while (col != EMPTY) - { - DEBUG4 ((" %d", col)) ; - have += Col [col].shared1.thickness ; - ASSERT (COL_IS_ALIVE (col)) ; - col = Col [col].shared4.degree_next ; - } - DEBUG4 (("\n")) ; - } - DEBUG4 (("should %d have %d\n", should, have)) ; - ASSERT (should == have) ; - - /* === Check the row degrees ============================================ */ - - if (n_row > 10000 && colamd_debug <= 0) - { - return ; - } - for (row = 0 ; row < n_row ; row++) - { - if (ROW_IS_ALIVE (row)) - { - ASSERT (Row [row].shared1.degree <= max_deg) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_mark =========================================================== */ -/* ========================================================================== */ - -/* - Ensures that the tag_mark is less that the maximum and also ensures that - each entry in the mark array is less than the tag mark. -*/ - -PRIVATE void debug_mark -( - /* === Parameters ======================================================= */ - - int n_row, - Colamd_Row Row [], - int tag_mark, - int max_mark -) -{ - /* === Local variables ================================================== */ - - int r ; - - /* === Check the Row marks ============================================== */ - - ASSERT (tag_mark > 0 && tag_mark <= max_mark) ; - if (n_row > 10000 && colamd_debug <= 0) - { - return ; - } - for (r = 0 ; r < n_row ; r++) - { - ASSERT (Row [r].shared2.mark < tag_mark) ; - } -} - - -/* ========================================================================== */ -/* === debug_matrix ========================================================= */ -/* ========================================================================== */ - -/* - Prints out the contents of the columns and the rows. -*/ - -PRIVATE void debug_matrix -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - Colamd_Row Row [], - Colamd_Col Col [], - int A [] -) -{ - /* === Local variables ================================================== */ - - int r ; - int c ; - int *rp ; - int *rp_end ; - int *cp ; - int *cp_end ; - - /* === Dump the rows and columns of the matrix ========================== */ - - if (colamd_debug < 3) - { - return ; - } - DEBUG3 (("DUMP MATRIX:\n")) ; - for (r = 0 ; r < n_row ; r++) - { - DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ; - if (ROW_IS_DEAD (r)) - { - continue ; - } - DEBUG3 (("start %d length %d degree %d\n", - Row [r].start, Row [r].length, Row [r].shared1.degree)) ; - rp = &A [Row [r].start] ; - rp_end = rp + Row [r].length ; - while (rp < rp_end) - { - c = *rp++ ; - DEBUG4 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ; - } - } - - for (c = 0 ; c < n_col ; c++) - { - DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ; - if (COL_IS_DEAD (c)) - { - continue ; - } - DEBUG3 (("start %d length %d shared1 %d shared2 %d\n", - Col [c].start, Col [c].length, - Col [c].shared1.thickness, Col [c].shared2.score)) ; - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - r = *cp++ ; - DEBUG4 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ; - } - } -} - -PRIVATE void colamd_get_debug -( - char *method -) -{ - colamd_debug = 0 ; /* no debug printing */ - - /* get "D" environment variable, which gives the debug printing level */ - if (getenv ("D")) - { - colamd_debug = atoi (getenv ("D")) ; - } - - DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n", - method, colamd_debug)) ; -} - -#endif /* NDEBUG */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,246 +0,0 @@ -/* ========================================================================== */ -/* === colamd/symamd prototypes and definitions ============================= */ -/* ========================================================================== */ - -/* - You must include this file (colamd.h) in any routine that uses colamd, - symamd, or the related macros and definitions. - - Authors: - - The authors of the code itself are Stefan I. Larimore and Timothy A. - Davis (davis@cise.ufl.edu), University of Florida. The algorithm was - developed in collaboration with John Gilbert, Xerox PARC, and Esmond - Ng, Oak Ridge National Laboratory. - - Date: - - September 8, 2003. Version 2.3. - - Acknowledgements: - - This work was supported by the National Science Foundation, under - grants DMS-9504974 and DMS-9803599. - - Notice: - - Copyright (c) 1998-2003 by the University of Florida. - All Rights Reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use, copy, modify, and/or distribute - this program, provided that the Copyright, this License, and the - Availability of the original version is retained on all copies and made - accessible to the end-user of any code or package that includes COLAMD - or any modified version of COLAMD. - - Availability: - - The colamd/symamd library is available at - - http://www.cise.ufl.edu/research/sparse/colamd/ - - This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h - file. It is required by the colamd.c, colamdmex.c, and symamdmex.c - files, and by any C code that calls the routines whose prototypes are - listed below, or that uses the colamd/symamd definitions listed below. - -*/ - -#ifndef COLAMD_H -#define COLAMD_H - -/* ========================================================================== */ -/* === Include files ======================================================== */ -/* ========================================================================== */ - -#include - -/* ========================================================================== */ -/* === Knob and statistics definitions ====================================== */ -/* ========================================================================== */ - -/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 - -/* number of output statistics. Only stats [0..6] are currently used. */ -#define COLAMD_STATS 20 - -/* knobs [0] and stats [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 - -/* knobs [1] and stats [1]: dense column knob and output statistic. */ -#define COLAMD_DENSE_COL 1 - -/* stats [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 - -/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -#define COLAMD_STATUS 3 - -/* stats [4..6]: error info, or info on jumbled columns */ -#define COLAMD_INFO1 4 -#define COLAMD_INFO2 5 -#define COLAMD_INFO3 6 - -/* error codes returned in stats [3]: */ -#define COLAMD_OK (0) -#define COLAMD_OK_BUT_JUMBLED (1) -#define COLAMD_ERROR_A_not_present (-1) -#define COLAMD_ERROR_p_not_present (-2) -#define COLAMD_ERROR_nrow_negative (-3) -#define COLAMD_ERROR_ncol_negative (-4) -#define COLAMD_ERROR_nnz_negative (-5) -#define COLAMD_ERROR_p0_nonzero (-6) -#define COLAMD_ERROR_A_too_small (-7) -#define COLAMD_ERROR_col_length_negative (-8) -#define COLAMD_ERROR_row_index_out_of_bounds (-9) -#define COLAMD_ERROR_out_of_memory (-10) -#define COLAMD_ERROR_internal_error (-999) - -/* ========================================================================== */ -/* === Row and Column structures ============================================ */ -/* ========================================================================== */ - -/* User code that makes use of the colamd/symamd routines need not directly */ -/* reference these structures. They are used only for the COLAMD_RECOMMENDED */ -/* macro. */ - -typedef struct Colamd_Col_struct -{ - int start ; /* index for A of first row in this column, or DEAD */ - /* if column is dead */ - int length ; /* number of rows in this column */ - union - { - int thickness ; /* number of original columns represented by this */ - /* col, if the column is alive */ - int parent ; /* parent in parent tree super-column structure, if */ - /* the column is dead */ - } shared1 ; - union - { - int score ; /* the score used to maintain heap, if col is alive */ - int order ; /* pivot ordering of this column, if col is dead */ - } shared2 ; - union - { - int headhash ; /* head of a hash bucket, if col is at the head of */ - /* a degree list */ - int hash ; /* hash value, if col is not in a degree list */ - int prev ; /* previous column in degree list, if col is in a */ - /* degree list (but not at the head of a degree list) */ - } shared3 ; - union - { - int degree_next ; /* next column, if col is in a degree list */ - int hash_next ; /* next column, if col is in a hash list */ - } shared4 ; - -} Colamd_Col ; - -typedef struct Colamd_Row_struct -{ - int start ; /* index for A of first col in this row */ - int length ; /* number of principal columns in this row */ - union - { - int degree ; /* number of principal & non-principal columns in row */ - int p ; /* used as a row pointer in init_rows_cols () */ - } shared1 ; - union - { - int mark ; /* for computing set differences and marking dead rows*/ - int first_column ;/* first column in row (used in garbage collection) */ - } shared2 ; - -} Colamd_Row ; - -/* ========================================================================== */ -/* === Colamd recommended memory size ======================================= */ -/* ========================================================================== */ - -/* - The recommended length Alen of the array A passed to colamd is given by - the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any - argument is negative. 2*nnz space is required for the row and column - indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is - required for the Col and Row arrays, respectively, which are internal to - colamd. An additional n_col space is the minimal amount of "elbow room", - and nnz/5 more space is recommended for run time efficiency. - - This macro is not needed when using symamd. - - Explicit typecast to int added Sept. 23, 2002, COLAMD version 2.2, to avoid - gcc -pedantic warning messages. -*/ - -#define COLAMD_C(n_col) ((int) (((n_col) + 1) * sizeof (Colamd_Col) / sizeof (int))) -#define COLAMD_R(n_row) ((int) (((n_row) + 1) * sizeof (Colamd_Row) / sizeof (int))) - -#define COLAMD_RECOMMENDED(nnz, n_row, n_col) \ -( \ -((nnz) < 0 || (n_row) < 0 || (n_col) < 0) \ -? \ - (-1) \ -: \ - (2 * (nnz) + COLAMD_C (n_col) + COLAMD_R (n_row) + (n_col) + ((nnz) / 5)) \ -) - -/* ========================================================================== */ -/* === Prototypes of user-callable routines ================================= */ -/* ========================================================================== */ - -int colamd_recommended /* returns recommended value of Alen, */ - /* or (-1) if input arguments are erroneous */ -( - int nnz, /* nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) ; - -void colamd_set_defaults /* sets default parameters */ -( /* knobs argument is modified on output */ - double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ -) ; - -int colamd /* returns (1) if successful, (0) otherwise*/ -( /* A and p arguments are modified on output */ - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* size of the array A */ - int A [], /* row indices of A, of size Alen */ - int p [], /* column pointers of A, of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameter settings for colamd */ - int stats [COLAMD_STATS] /* colamd output statistics and error codes */ -) ; - -int symamd /* return (1) if OK, (0) otherwise */ -( - int n, /* number of rows and columns of A */ - int A [], /* row indices of A */ - int p [], /* column pointers of A */ - int perm [], /* output permutation, size n_col+1 */ - double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */ - int stats [COLAMD_STATS], /* output statistics and error codes */ - void * (*allocate) (size_t, size_t), - /* pointer to calloc (ANSI C) or */ - /* mxCalloc (for MATLAB mexFunction) */ - void (*release) (void *) - /* pointer to free (ANSI C) or */ - /* mxFree (for MATLAB mexFunction) */ -) ; - -void colamd_report -( - int stats [COLAMD_STATS] -) ; - -void symamd_report -( - int stats [COLAMD_STATS] -) ; - -#endif /* COLAMD_H */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,478 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_cdefs.h" - -/* - * Function prototypes - */ -void clsolve(int, int, complex *, complex *); -void cmatvec(int, int, int, complex *, complex *, complex *); -extern void ccheck_tempv(); - -void -cpanel_bmod ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - const int nseg, /* in */ - complex *dense, /* out, of size n by w */ - complex *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in, of size n by w */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * - * Performs numeric block updates (sup-panel) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - * Before entering this routine, the original nonzeros in the panel - * were already copied into the spa[m,w]. - * - * Updated/Output parameters- - * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned - * collectively in the m-by-w vector dense[*]. - * - */ - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - complex alpha, beta; -#endif - - register int k, ksub; - int fsupc, nsupc, nsupr, nrow; - int krep, krep_ind; - complex ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int segsze; - int block_nrow; /* no of rows in a block row */ - register int lptr; /* Points to the row subscripts of a supernode */ - int kfnz, irow, no_zeros; - register int isub, isub1, i; - register int jj; /* Index through each column in the panel */ - int *xsup, *supno; - int *lsub, *xlsub; - complex *lusup; - int *xlusup; - int *repfnz_col; /* repfnz[] for a column in the panel */ - complex *dense_col; /* dense[] for a column in the panel */ - complex *tempv1; /* Used in 1-D update */ - complex *TriTmp, *MatvecTmp; /* used in 2-D update */ - complex zero = {0.0, 0.0}; - complex one = {1.0, 0.0}; - complex comp_temp, comp_temp1; - register int ldaTmp; - register int r_ind, r_hi; - static int first = 1, maxsuper, rowblk, colblk; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - if ( first ) { - maxsuper = sp_ienv(3); - rowblk = sp_ienv(4); - colblk = sp_ienv(5); - first = 0; - } - ldaTmp = maxsuper + rowblk; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in a supernode - * nsupr = no of rows in a supernode - */ - krep = segrep[k--]; - fsupc = xsup[supno[krep]]; - nsupc = krep - fsupc + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nrow = nsupr - nsupc; - lptr = xlsub[fsupc]; - krep_ind = lptr + nsupc - 1; - - repfnz_col = repfnz; - dense_col = dense; - - if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ - - TriTmp = tempv; - - /* Sequence through each column in panel -- triangular solves */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += 4 * segsze * (segsze - 1); - ops[GEMV] += 8 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - c_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - c_sub(&ukj1, &ukj1, &comp_temp); - - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; luptr2++; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } - - } else { /* segsze >= 4 */ - - /* Copy U[*,j] segment from dense[*] to TriTmp[*], which - holds the result of triangular solves. */ - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - TriTmp[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#else - ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#endif -#else - clsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); -#endif - - - } /* else ... */ - - } /* for jj ... end tri-solves */ - - /* Block row updates; push all the way into dense[*] block */ - for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { - - r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); - block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); - luptr = xlusup[fsupc] + nsupc + r_ind; - isub1 = lptr + nsupc + r_ind; - - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - /* Sequence through each column in panel -- matrix-vector */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - /* Perform a block update, and scatter the result of - matrix-vector to dense[]. */ - no_zeros = kfnz - fsupc; - luptr1 = luptr + nsupr * no_zeros; - MatvecTmp = &TriTmp[maxsuper]; - -#ifdef USE_VENDOR_BLAS - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#else - cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#endif -#else - cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], - TriTmp, MatvecTmp); -#endif - - /* Scatter MatvecTmp[*] into SPA dense[*] temporarily - * such that MatvecTmp[*] can be re-used for the - * the next blok row update. dense[] will be copied into - * global store after the whole panel has been finished. - */ - isub = isub1; - for (i = 0; i < block_nrow; i++) { - irow = lsub[isub]; - c_sub(&dense_col[irow], &dense_col[irow], - &MatvecTmp[i]); - MatvecTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } /* for each block row ... */ - - /* Scatter the triangular solves into SPA dense[*] */ - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = TriTmp[i]; - TriTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } else { /* 1-D block modification */ - - - /* Sequence through each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += 4 * segsze * (segsze - 1); - ops[GEMV] += 8 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - c_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - c_sub(&ukj1, &ukj1, &comp_temp); - - cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; ++luptr2; - cc_mult(&comp_temp, &ukj, &lusup[luptr]); - cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - c_add(&comp_temp, &comp_temp, &comp_temp1); - c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } - - } else { /* segsze >= 4 */ - /* - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense[]. - */ - no_zeros = kfnz - fsupc; - - /* Copy U[*,j] segment from dense[*] to tempv[*]: - * The result of triangular solve is in tempv[*]; - * The result of matrix vector update is in dense_col[*] - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - tempv[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - clsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); -#endif - - /* Scatter tempv[*] into SPA dense[*] temporarily, such - * that tempv[*] can be used for the triangular solve of - * the next column of the panel. They will be copied into - * ucol[*] after the whole panel has been finished. - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = tempv[i]; - tempv[i] = zero; - isub++; - } - - /* Scatter the update from tempv1[*] into SPA dense[*] */ - /* Start dense rectangular L */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); - tempv1[i] = zero; - ++isub; - } - - } /* else segsze>=4 ... */ - - } /* for each column in the panel... */ - - } /* else 1-D update ... */ - - } /* for each updating supernode ... */ - -} - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,247 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -void -cpanel_dfs ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - SuperMatrix *A, /* in - original matrix */ - int *perm_r, /* in */ - int *nseg, /* out */ - complex *dense, /* out */ - int *panel_lsub, /* out */ - int *segrep, /* out */ - int *repfnz, /* out */ - int *xprune, /* out */ - int *marker, /* out */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * - * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. - * - * The routine returns one list of the supernodal representatives - * in topological order of the dfs that generates them. This list is - * a superset of the topological order of each individual column within - * the panel. - * The location of the first nonzero in each supernodal segment - * (supernodal entry location) is also returned. Each column has a - * separate list for this purpose. - * - * Two marker arrays are used for dfs: - * marker[i] == jj, if i was visited during dfs of current column jj; - * marker1[i] >= jcol, if i was visited by earlier columns in this panel; - * - * marker: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - */ - NCPformat *Astore; - complex *a; - int *asub; - int *xa_begin, *xa_end; - int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; - int k, krow, kmark, kperm; - int xdfs, maxdfs, kpar; - int jj; /* index through each column in the panel */ - int *marker1; /* marker1[jj] >= jcol if vertex jj was visited - by a previous column within this panel. */ - int *repfnz_col; /* start of each column in the panel */ - complex *dense_col; /* start of each column in the panel */ - int nextl_col; /* next available position in panel_lsub[*,jj] */ - int *xsup, *supno; - int *lsub, *xlsub; - - /* Initialize pointers */ - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - marker1 = marker + m; - repfnz_col = repfnz; - dense_col = dense; - *nseg = 0; - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - - /* For each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++) { - nextl_col = (jj - jcol) * m; - -#ifdef CHK_DFS - printf("\npanel col %d: ", jj); -#endif - - /* For each nonz in A[*,jj] do dfs */ - for (k = xa_begin[jj]; k < xa_end[jj]; k++) { - krow = asub[k]; - dense_col[krow] = a[k]; - kmark = marker[krow]; - if ( kmark == jj ) - continue; /* krow visited before, go to the next nonzero */ - - /* For each unmarked nbr krow of jj - * krow is in L: place it in structure of L[*,jj] - */ - marker[krow] = jj; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ - } - /* - * krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - else { - - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz_col[krep]; - -#ifdef CHK_DFS - printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); -#endif - if ( myfnz != EMPTY ) { /* Representative visited before */ - if ( myfnz > kperm ) repfnz_col[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz_col[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker[kchild]; - - if ( chmark != jj ) { /* Not reached yet */ - marker[kchild] = jj; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,j] */ - if ( chperm == EMPTY ) { - panel_lsub[nextl_col++] = kchild; - } - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - else { - - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz_col[chrep]; -#ifdef CHK_DFS - printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); -#endif - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz_col[chrep] = chperm; - } - else { - /* Cont. dfs at snode-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L) */ - parent[krep] = oldrep; - repfnz_col[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } /* else */ - - } /* else */ - - } /* if... */ - - } /* while xdfs < maxdfs */ - - /* krow has no more unexplored nbrs: - * Place snode-rep krep in postorder DFS, if this - * segment is seen for the first time. (Note that - * "repfnz[krep]" may change later.) - * Backtrack dfs to its parent. - */ - if ( marker1[krep] < jcol ) { - segrep[*nseg] = krep; - ++(*nseg); - marker1[krep] = jj; - } - - kpar = parent[krep]; /* Pop stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } while ( kpar != EMPTY ); /* do-while - until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonz in A[*,jj] */ - - repfnz_col += m; /* Move to next column */ - dense_col += m; - - } /* for jj ... */ - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,108 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_cdefs.h" - -float -cPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, - SuperMatrix *L, SuperMatrix *U) -{ -/* - * Purpose - * ======= - * - * Compute the reciprocal pivot growth factor of the leading ncols columns - * of the matrix, using the formula: - * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) - * - * Arguments - * ========= - * - * ncols (input) int - * The number of columns of matrices A, L and U. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = NC; Dtype = SLU_C; Mtype = GE. - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SC; Dtype = SLU_C; Mtype = TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = NC; - * Dtype = SLU_C; Mtype = TRU. - * - */ - NCformat *Astore; - SCformat *Lstore; - NCformat *Ustore; - complex *Aval, *Lval, *Uval; - int fsupc, nsupr, luptr, nz_in_U; - int i, j, k, oldcol; - int *inv_perm_c; - float rpg, maxaj, maxuj; - extern double slamch_(char *); - float smlnum; - complex *luval; - complex temp_comp; - - /* Get machine constants. */ - smlnum = slamch_("S"); - rpg = 1. / smlnum; - - Astore = A->Store; - Lstore = L->Store; - Ustore = U->Store; - Aval = Astore->nzval; - Lval = Lstore->nzval; - Uval = Ustore->nzval; - - inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); - for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; - - for (k = 0; k <= Lstore->nsuper; ++k) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - luptr = L_NZ_START(fsupc); - luval = &Lval[luptr]; - nz_in_U = 1; - - for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { - maxaj = 0.; - oldcol = inv_perm_c[j]; - for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) - maxaj = SUPERLU_MAX( maxaj, c_abs1( &Aval[i]) ); - - maxuj = 0.; - for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) - maxuj = SUPERLU_MAX( maxuj, c_abs1( &Uval[i]) ); - - /* Supernode */ - for (i = 0; i < nz_in_U; ++i) - maxuj = SUPERLU_MAX( maxuj, c_abs1( &luval[i]) ); - - ++nz_in_U; - luval += nsupr; - - if ( maxuj == 0. ) - rpg = SUPERLU_MIN( rpg, 1.); - else - rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); - } - - if ( j >= ncols ) break; - } - - SUPERLU_FREE(inv_perm_c); - return (rpg); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,174 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_cdefs.h" - -#undef DEBUG - -int -cpivotL( - const int jcol, /* in */ - const float u, /* in - diagonal pivoting threshold */ - int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ - int *perm_r, /* may be modified */ - int *iperm_r, /* in - inverse of perm_r */ - int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ - int *pivrow, /* out */ - GlobalLU_t *Glu, /* modified - global LU data structures */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * Performs the numerical pivoting on the current column of L, - * and the CDIV operation. - * - * Pivot policy: - * (1) Compute thresh = u * max_(i>=j) abs(A_ij); - * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN - * pivot row = k; - * ELSE IF abs(A_jj) >= thresh THEN - * pivot row = j; - * ELSE - * pivot row = m; - * - * Note: If you absolutely want to use a given pivot order, then set u=0.0. - * - * Return value: 0 success; - * i > 0 U(i,i) is exactly zero. - * - */ - complex one = {1.0, 0.0}; - int fsupc; /* first column in the supernode */ - int nsupc; /* no of columns in the supernode */ - int nsupr; /* no of rows in the supernode */ - int lptr; /* points to the starting subscript of the supernode */ - int pivptr, old_pivptr, diag, diagind; - float pivmax, rtemp, thresh; - complex temp; - complex *lu_sup_ptr; - complex *lu_col_ptr; - int *lsub_ptr; - int isub, icol, k, itemp; - int *lsub, *xlsub; - complex *lusup; - int *xlusup; - flops_t *ops = stat->ops; - - /* Initialize pointers */ - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; - nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ - lptr = xlsub[fsupc]; - nsupr = xlsub[fsupc+1] - lptr; - lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ - lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ - lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ - -#ifdef DEBUG -if ( jcol == MIN_COL ) { - printf("Before cdiv: col %d\n", jcol); - for (k = nsupc; k < nsupr; k++) - printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); -} -#endif - - /* Determine the largest abs numerical value for partial pivoting; - Also search for user-specified pivot, and diagonal element. */ - if ( *usepr ) *pivrow = iperm_r[jcol]; - diagind = iperm_c[jcol]; - pivmax = 0.0; - pivptr = nsupc; - diag = EMPTY; - old_pivptr = nsupc; - for (isub = nsupc; isub < nsupr; ++isub) { - rtemp = c_abs1 (&lu_col_ptr[isub]); - if ( rtemp > pivmax ) { - pivmax = rtemp; - pivptr = isub; - } - if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; - if ( lsub_ptr[isub] == diagind ) diag = isub; - } - - /* Test for singularity */ - if ( pivmax == 0.0 ) { - *pivrow = lsub_ptr[pivptr]; - perm_r[*pivrow] = jcol; - *usepr = 0; - return (jcol+1); - } - - thresh = u * pivmax; - - /* Choose appropriate pivotal element by our policy. */ - if ( *usepr ) { - rtemp = c_abs1 (&lu_col_ptr[old_pivptr]); - if ( rtemp != 0.0 && rtemp >= thresh ) - pivptr = old_pivptr; - else - *usepr = 0; - } - if ( *usepr == 0 ) { - /* Use diagonal pivot? */ - if ( diag >= 0 ) { /* diagonal exists */ - rtemp = c_abs1 (&lu_col_ptr[diag]); - if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; - } - *pivrow = lsub_ptr[pivptr]; - } - - /* Record pivot row */ - perm_r[*pivrow] = jcol; - - /* Interchange row subscripts */ - if ( pivptr != nsupc ) { - itemp = lsub_ptr[pivptr]; - lsub_ptr[pivptr] = lsub_ptr[nsupc]; - lsub_ptr[nsupc] = itemp; - - /* Interchange numerical values as well, for the whole snode, such - * that L is indexed the same way as A. - */ - for (icol = 0; icol <= nsupc; icol++) { - itemp = pivptr + icol * nsupr; - temp = lu_sup_ptr[itemp]; - lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; - lu_sup_ptr[nsupc + icol*nsupr] = temp; - } - } /* if */ - - /* cdiv operation */ - ops[FACT] += 10 * (nsupr - nsupc); - - c_div(&temp, &one, &lu_col_ptr[nsupc]); - for (k = nsupc+1; k < nsupr; k++) - cc_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpruneL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpruneL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpruneL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,147 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -void -cpruneL( - const int jcol, /* in */ - const int *perm_r, /* in */ - const int pivrow, /* in */ - const int nseg, /* in */ - const int *segrep, /* in */ - const int *repfnz, /* in */ - int *xprune, /* out */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ -/* - * Purpose - * ======= - * Prunes the L-structure of supernodes whose L-structure - * contains the current pivot row "pivrow" - * - */ - complex utemp; - int jsupno, irep, irep1, kmin, kmax, krow, movnum; - int i, ktemp, minloc, maxloc; - int do_prune; /* logical variable */ - int *xsup, *supno; - int *lsub, *xlsub; - complex *lusup; - int *xlusup; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - /* - * For each supernode-rep irep in U[*,j] - */ - jsupno = supno[jcol]; - for (i = 0; i < nseg; i++) { - - irep = segrep[i]; - irep1 = irep + 1; - do_prune = FALSE; - - /* Don't prune with a zero U-segment */ - if ( repfnz[irep] == EMPTY ) - continue; - - /* If a snode overlaps with the next panel, then the U-segment - * is fragmented into two parts -- irep and irep1. We should let - * pruning occur at the rep-column in irep1's snode. - */ - if ( supno[irep] == supno[irep1] ) /* Don't prune */ - continue; - - /* - * If it has not been pruned & it has a nonz in row L[pivrow,i] - */ - if ( supno[irep] != jsupno ) { - if ( xprune[irep] >= xlsub[irep1] ) { - kmin = xlsub[irep]; - kmax = xlsub[irep1] - 1; - for (krow = kmin; krow <= kmax; krow++) - if ( lsub[krow] == pivrow ) { - do_prune = TRUE; - break; - } - } - - if ( do_prune ) { - - /* Do a quicksort-type partition - * movnum=TRUE means that the num values have to be exchanged. - */ - movnum = FALSE; - if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ - movnum = TRUE; - - while ( kmin <= kmax ) { - - if ( perm_r[lsub[kmax]] == EMPTY ) - kmax--; - else if ( perm_r[lsub[kmin]] != EMPTY ) - kmin++; - else { /* kmin below pivrow, and kmax above pivrow: - * interchange the two subscripts - */ - ktemp = lsub[kmin]; - lsub[kmin] = lsub[kmax]; - lsub[kmax] = ktemp; - - /* If the supernode has only one column, then we - * only keep one set of subscripts. For any subscript - * interchange performed, similar interchange must be - * done on the numerical values. - */ - if ( movnum ) { - minloc = xlusup[irep] + (kmin - xlsub[irep]); - maxloc = xlusup[irep] + (kmax - xlsub[irep]); - utemp = lusup[minloc]; - lusup[minloc] = lusup[maxloc]; - lusup[maxloc] = utemp; - } - - kmin++; - kmax--; - - } - - } /* while */ - - xprune[irep] = kmin; /* Pruning */ - -#ifdef CHK_PRUNE - printf(" After cpruneL(),using col %d: xprune[%d] = %d\n", - jcol, irep, kmin); -#endif - } /* if do_prune */ - - } /* if */ - - } /* for each U-segment... */ -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/creadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/creadhb.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/creadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/creadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include -#include "slu_cdefs.h" - - -/* Eat up the rest of the current line */ -int cDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -int cParseIntFormat(char *buf, int *num, int *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - sscanf(tmp, "%d", num); - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - sscanf(tmp, "%d", size); - return 0; -} - -int cParseFloatFormat(char *buf, int *num, int *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ - - return 0; -} - -int cReadVector(FILE *fp, int n, int *where, int perline, int persize) -{ - register int i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jops; - - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - nextlu = xlusup[jcol]; - - /* - * Process the supernodal portion of L\U[*,j] - */ - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = comp_zero; - ++nextlu; - } - - xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ - - if ( fsupc < jcol ) { - - luptr = xlusup[fsupc]; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nsupc = jcol - fsupc; /* Excluding jcol */ - ufirst = xlusup[jcol]; /* Points to the beginning of column - jcol in supernode L\U(jsupno). */ - nrow = nsupr - nsupc; - - ops[TRSV] += 4 * nsupc * (nsupc - 1); - ops[GEMV] += 8 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], &tempv[0] ); - - /* Scatter tempv[*] into lusup[*] */ - iptr = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - c_sub(&lusup[iptr], &lusup[iptr], &tempv[i]); - ++iptr; - tempv[i] = comp_zero; - } -#endif - - } - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_cdefs.h" - -int -csnode_dfs ( - const int jcol, /* in - start of the supernode */ - const int kcol, /* in - end of the supernode */ - const int *asub, /* in */ - const int *xa_begin, /* in */ - const int *xa_end, /* in */ - int *xprune, /* out */ - int *marker, /* modified */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* Purpose - * ======= - * csnode_dfs() - Determine the union of the row structures of those - * columns within the relaxed snode. - * Note: The relaxed snodes are leaves of the supernodal etree, therefore, - * the portion outside the rectangular supernode must be zero. - * - * Return value - * ============ - * 0 success; - * >0 number of bytes allocated when run out of memory. - * - */ - register int i, k, ifrom, ito, nextl, new_next; - int nsuper, krow, kmark, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - int nzlmax; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - nsuper = ++supno[jcol]; /* Next available supernode number */ - nextl = xlsub[jcol]; - - for (i = jcol; i <= kcol; i++) { - /* For each nonzero in A[*,i] */ - for (k = xa_begin[i]; k < xa_end[i]; k++) { - krow = asub[k]; - kmark = marker[krow]; - if ( kmark != kcol ) { /* First time visit krow */ - marker[krow] = kcol; - lsub[nextl++] = krow; - if ( nextl >= nzlmax ) { - if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - } - } - supno[i] = nsuper; - } - - /* Supernode > 1, then make a copy of the subscripts for pruning */ - if ( jcol < kcol ) { - new_next = nextl + (nextl - xlsub[jcol]); - while ( new_next > nzlmax ) { - if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - ito = nextl; - for (ifrom = xlsub[jcol]; ifrom < nextl; ) - lsub[ito++] = lsub[ifrom++]; - for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; - nextl = ito; - } - - xsup[nsuper+1] = kcol + 1; - supno[kcol+1] = nsuper; - xprune[kcol] = nextl; - xlsub[kcol+1] = nextl; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,565 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: csp_blas2.c - * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. - */ - -#include "slu_cdefs.h" - -/* - * Function prototypes - */ -void cusolve(int, int, complex*, complex*); -void clsolve(int, int, complex*, complex*); -void cmatvec(int, int, int, complex*, complex*, complex*); - - -int -sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, - SuperMatrix *U, complex *x, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * sp_ctrsv() solves one of the systems of equations - * A*x = b, or A'*x = b, - * where b and x are n element vectors and A is a sparse unit , or - * non-unit, upper or lower triangular matrix. - * No test for singularity or near-singularity is included in this - * routine. Such tests must be performed before calling this routine. - * - * Parameters - * ========== - * - * uplo - (input) char* - * On entry, uplo specifies whether the matrix is an upper or - * lower triangular matrix as follows: - * uplo = 'U' or 'u' A is an upper triangular matrix. - * uplo = 'L' or 'l' A is a lower triangular matrix. - * - * trans - (input) char* - * On entry, trans specifies the equations to be solved as - * follows: - * trans = 'N' or 'n' A*x = b. - * trans = 'T' or 't' A'*x = b. - * trans = 'C' or 'c' A^H*x = b. - * - * diag - (input) char* - * On entry, diag specifies whether or not A is unit - * triangular as follows: - * diag = 'U' or 'u' A is assumed to be unit triangular. - * diag = 'N' or 'n' A is not assumed to be unit - * triangular. - * - * L - (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU. - * - * U - (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. - * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU. - * - * x - (input/output) complex* - * Before entry, the incremented array X must contain the n - * element right-hand side vector b. On exit, X is overwritten - * with the solution vector x. - * - * info - (output) int* - * If *info = -i, the i-th argument had an illegal value. - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - SCformat *Lstore; - NCformat *Ustore; - complex *Lval, *Uval; - int incx = 1, incy = 1; - complex temp; - complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; - complex comp_zero = {0.0, 0.0}; - int nrow; - int fsupc, nsupr, nsupc, luptr, istart, irow; - int i, k, iptr, jcol; - complex *work; - flops_t solve_ops; - - /* Test the input parameters */ - *info = 0; - if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; - else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && - !lsame_(trans, "C")) *info = -2; - else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; - else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - xerbla_("sp_ctrsv", &i); - return 0; - } - - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( !(work = complexCalloc(L->nrow)) ) - ABORT("Malloc fails for work in sp_ctrsv()."); - - if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - /* 1 c_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; - solve_ops += 8 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]); - c_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#endif -#else - clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ - work[i] = comp_zero; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - /* 1 c_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - cc_mult(&comp_zero, &x[fsupc], &Uval[i]); - c_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif -#else - cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - cc_mult(&comp_zero, &x[jcol], &Uval[i]); - c_sub(&x[irow], &x[irow], &comp_zero); - } - } - } - } /* for k ... */ - - } - } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 8 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - cc_mult(&comp_zero, &x[irow], &Lval[i]); - c_sub(&x[jcol], &x[jcol], &comp_zero); - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += 4 * nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - cc_mult(&comp_zero, &x[irow], &Uval[i]); - c_sub(&x[jcol], &x[jcol], &comp_zero); - } - } - - /* 1 c_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } else { /* Form x := conj(inv(A'))*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := conj(inv(L'))*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 8 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - cc_conj(&temp, &Lval[i]); - cc_mult(&comp_zero, &x[irow], &temp); - c_sub(&x[jcol], &x[jcol], &comp_zero); - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += 4 * nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd(trans, strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ctrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := conj(inv(U'))*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - cc_conj(&temp, &Uval[i]); - cc_mult(&comp_zero, &x[irow], &temp); - c_sub(&x[jcol], &x[jcol], &comp_zero); - } - } - - /* 1 c_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - cc_conj(&temp, &Lval[luptr]); - c_div(&x[fsupc], &x[fsupc], &temp); - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd(trans, strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ctrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - stat->ops[SOLVE] += solve_ops; - SUPERLU_FREE(work); - return 0; -} - - - -int -sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x, - int incx, complex beta, complex *y, int incy) -{ -/* Purpose - ======= - - sp_cgemv() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) complex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - - X - (input) complex*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) complex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) complex*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - complex *Aval; - int info; - complex temp, temp1; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - complex comp_zero = {0.0, 0.0}; - complex comp_one = {1.0, 0.0}; - - notran = lsame_(trans, "N"); - Astore = A->Store; - Aval = Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - xerbla_("sp_cgemv ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || - c_eq(&alpha, &comp_zero) && - c_eq(&beta, &comp_one)) - return 0; - - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (lsame_(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if ( !c_eq(&beta, &comp_one) ) { - if (incy == 1) { - if ( c_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) y[i] = comp_zero; - else - for (i = 0; i < leny; ++i) - cc_mult(&y[i], &beta, &y[i]); - } else { - iy = ky; - if ( c_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) { - y[iy] = comp_zero; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - cc_mult(&y[iy], &beta, &y[iy]); - iy += incy; - } - } - } - - if ( c_eq(&alpha, &comp_zero) ) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if ( !c_eq(&x[jx], &comp_zero) ) { - cc_mult(&temp, &alpha, &x[jx]); - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - cc_mult(&temp1, &temp, &Aval[i]); - c_add(&y[irow], &y[irow], &temp1); - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = comp_zero; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - cc_mult(&temp1, &Aval[i], &x[irow]); - c_add(&temp, &temp, &temp1); - } - cc_mult(&temp1, &alpha, &temp); - c_add(&y[jy], &y[jy], &temp1); - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_cgemv */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas3.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "slu_cdefs.h" - -int -sp_cgemm(char *transa, char *transb, int m, int n, int k, - complex alpha, SuperMatrix *A, complex *b, int ldb, - complex beta, complex *c, int ldc) -{ -/* Purpose - ======= - - sp_c performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) complex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_C; Mtype = GE. - In the future, more general A can be handled. - - B - COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) complex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - COMPLEX PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_cgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cutil.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,485 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "slu_cdefs.h" - -void -cCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, - complex *nzval, int *rowind, int *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -cCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, - complex *nzval, int *colind, int *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* Copy matrix A into matrix B. */ -void -cCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((complex *)Bstore->nzval)[i] = ((complex *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void -cCreate_Dense_Matrix(SuperMatrix *X, int m, int n, complex *x, int ldx, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (complex *) x; -} - -void -cCopy_Dense_Matrix(int M, int N, complex *X, int ldx, - complex *Y, int ldy) -{ -/* - * - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -cCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, - complex *nzval, int *nzval_colptr, int *rowind, - int *rowind_colptr, int *col_to_sup, int *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -cCompRow_to_CompCol(int m, int n, int nnz, - complex *a, int *colind, int *rowptr, - complex **at, int **rowind, int **colptr) -{ - register int i, j, col, relpos; - int *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (complex *) complexMalloc(nnz); - *rowind = (int *) intMalloc(nnz); - *colptr = (int *) intMalloc(n+1); - marker = (int *) intCalloc(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - - -void -cPrint_CompCol_Matrix(char *what, SuperMatrix *A) -{ - NCformat *Astore; - register int i,n; - float *dp; - - printf("\nCompCol matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (NCformat *) A->Store; - dp = (float *) Astore->nzval; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - printf("nzval: "); - for (i = 0; i < 2*Astore->colptr[n]; ++i) printf("%f ", dp[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); - printf("\n"); - fflush(stdout); -} - -void -cPrint_SuperNode_Matrix(char *what, SuperMatrix *A) -{ - SCformat *Astore; - register int i, j, k, c, d, n, nsup; - float *dp; - int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; - - printf("\nSuperNode matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (SCformat *) A->Store; - dp = (float *) Astore->nzval; - col_to_sup = Astore->col_to_sup; - sup_to_col = Astore->sup_to_col; - rowind_colptr = Astore->rowind_colptr; - rowind = Astore->rowind; - printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", - A->nrow,A->ncol,Astore->nnz,Astore->nsuper); - printf("nzval:\n"); - for (k = 0; k <= Astore->nsuper; ++k) { - c = sup_to_col[k]; - nsup = sup_to_col[k+1] - c; - for (j = c; j < c + nsup; ++j) { - d = Astore->nzval_colptr[j]; - for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { - printf("%d\t%d\t%e\t%e\n", rowind[i], j, dp[d], dp[d+1]); - d += 2; - } - } - } -#if 0 - for (i = 0; i < 2*Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); -#endif - printf("\nnzval_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->rowind_colptr[n]; ++i) - printf("%d ", Astore->rowind[i]); - printf("\nrowind_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); - printf("\ncol_to_sup: "); - for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); - printf("\nsup_to_col: "); - for (i = 0; i <= Astore->nsuper+1; ++i) - printf("%d ", sup_to_col[i]); - printf("\n"); - fflush(stdout); -} - -void -cPrint_Dense_Matrix(char *what, SuperMatrix *A) -{ - DNformat *Astore; - register int i, j, lda = Astore->lda; - float *dp; - - printf("\nDense matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - dp = (float *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); - printf("\nnzval: "); - for (j = 0; j < A->ncol; ++j) { - for (i = 0; i < 2*A->nrow; ++i) printf("%f ", dp[i + j*2*lda]); - printf("\n"); - } - printf("\n"); - fflush(stdout); -} - -/* - * Diagnostic print of column "jcol" in the U/L factor. - */ -void -cprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) -{ - int i, k, fsupc; - int *xsup, *supno; - int *xlsub, *lsub; - complex *lusup; - int *xlusup; - complex *ucol; - int *usub, *xusub; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - - printf("%s", msg); - printf("col %d: pivrow %d, supno %d, xprune %d\n", - jcol, pivrow, supno[jcol], xprune[jcol]); - - printf("\tU-col:\n"); - for (i = xusub[jcol]; i < xusub[jcol+1]; i++) - printf("\t%d%10.4f, %10.4f\n", usub[i], ucol[i].r, ucol[i].i); - printf("\tL-col in rectangular snode:\n"); - fsupc = xsup[supno[jcol]]; /* first col of the snode */ - i = xlsub[fsupc]; - k = xlusup[jcol]; - while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { - printf("\t%d\t%10.4f, %10.4f\n", lsub[i], lusup[k].r, lusup[k].i); - i++; k++; - } - fflush(stdout); -} - - -/* - * Check whether tempv[] == 0. This should be true before and after - * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". - */ -void ccheck_tempv(int n, complex *tempv) -{ - int i; - - for (i = 0; i < n; i++) { - if ((tempv[i].r != 0.0) || (tempv[i].i != 0.0)) - { - fprintf(stderr,"tempv[%d] = {%f, %f}\n", i, tempv[i].r, tempv[i].i); - ABORT("ccheck_tempv"); - } - } -} - - -void -cGenXtrue(int n, int nrhs, complex *x, int ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - x[i + j*ldx].r = 1.0; - x[i + j*ldx].i = 0.0; - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -cFillRHS(trans_t trans, int nrhs, complex *x, int ldx, - SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore; - complex *Aval; - DNformat *Bstore; - complex *rhs; - complex one = {1.0, 0.0}; - complex zero = {0.0, 0.0}; - int ldc; - char transc[1]; - - Astore = A->Store; - Aval = (complex *) Astore->nzval; - Bstore = B->Store; - rhs = Bstore->nzval; - ldc = Bstore->lda; - - if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; - else *(unsigned char *)transc = 'T'; - - sp_cgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldc); - -} - -/* - * Fills a complex precision array with a given value. - */ -void -cfill(complex *a, int alen, complex dval) -{ - register int i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void cinf_norm_error(int nrhs, SuperMatrix *X, complex *xtrue) -{ - DNformat *Xstore; - float err, xnorm; - complex *Xmat, *soln_work; - complex temp; - int i, j; - - Xstore = X->Store; - Xmat = Xstore->nzval; - - for (j = 0; j < nrhs; j++) { - soln_work = &Xmat[j*Xstore->lda]; - err = xnorm = 0.0; - for (i = 0; i < X->nrow; i++) { - c_sub(&temp, &soln_work[i], &xtrue[i]); - err = SUPERLU_MAX(err, c_abs(&temp)); - xnorm = SUPERLU_MAX(xnorm, c_abs(&soln_work[i])); - } - err = err / xnorm; - printf("||X - Xtrue||/||X|| = %e\n", err); - } -} - - - -/* Print performance of the code. */ -void -cPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, - float rpg, float rcond, float *ferr, - float *berr, char *equed, SuperLUStat_t *stat) -{ - SCformat *Lstore; - NCformat *Ustore; - double *utime; - flops_t *ops; - - utime = stat->utime; - ops = stat->ops; - - if ( utime[FACT] != 0. ) - printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], - ops[FACT]*1e-6/utime[FACT]); - printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); - if ( utime[SOLVE] != 0. ) - printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], - ops[SOLVE]*1e-6/utime[SOLVE]); - - Lstore = (SCformat *) L->Store; - Ustore = (NCformat *) U->Store; - printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); - printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); - printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); - - printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", - mem_usage->for_lu/1e6, mem_usage->total_needed/1e6, - mem_usage->expansions); - - printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); - printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", - utime[FACT], ops[FACT]*1e-6/utime[FACT], - utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], - utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); - - printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); - printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", - rpg, rcond, ferr[0], berr[0], equed); - -} - - - - -print_complex_vec(char *what, int n, complex *vec) -{ - int i; - printf("%s: n %d\n", what, n); - for (i = 0; i < n; ++i) printf("%d\t%f%f\n", i, vec[i].r, vec[i].i); - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,361 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include -#include "slu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -/* - * Function prototypes -void sludusolve(int, int, double*, double*); - */ -void sludlsolve(int, int, double*, double*); -void sludmatvec(int, int, int, double*, double*, double*); - - -/* Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -dcolumn_bmod ( - const int jcol, /* in */ - const int nseg, /* in */ - double *dense, /* in */ - double *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in */ - int fpanelc, /* in -- first column in the current panel */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose: - * ======== - * Performs numeric block updates (sup-col) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif -#ifdef USE_VENDOR_BLAS - int incx = 1, incy = 1; - double alpha, beta; -#endif - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in supernode - * nsupr = no of rows in supernode (used as leading dimension) - * luptr = location of supernodal LU-block in storage - * kfnz = first nonz in the k-th supernodal segment - * no_zeros = no of leading zeros in a supernodal U-segment - */ - double ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int fsupc, nsupc, nsupr, segsze; - int nrow; /* No of rows in the matrix of matrix-vector */ - int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; - register int lptr, kfnz, isub, irow, i; - register int no_zeros, new_next; - int ufirst, nextlu; - int fst_col; /* First column within small LU update */ - int d_fsupc; /* Distance between the first column of the current - panel and the first column of the current snode. */ - int *xsup, *supno; - int *lsub, *xlsub; - double *lusup; - int *xlusup; - int nzlumax; - double *tempv1; - double zero = 0.0; -#ifdef USE_VENDOR_BLAS - double one = 1.0; - double none = -1.0; -#endif - int mem_error; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - nzlumax = Glu->nzlumax; - jcolp1 = jcol + 1; - jsupno = supno[jcol]; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - - krep = segrep[k]; - k--; - ksupno = supno[krep]; - if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ - - fsupc = xsup[ksupno]; - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - /* Distance from the current supernode to the current panel; - d_fsupc=0 if fsupc > fpanelc. */ - d_fsupc = fst_col - fsupc; - - luptr = xlusup[fst_col] + d_fsupc; - lptr = xlsub[fsupc] + d_fsupc; - - kfnz = repfnz[krep]; - kfnz = SUPERLU_MAX ( kfnz, fpanelc ); - - segsze = krep - kfnz + 1; - nsupc = krep - fst_col + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nrow = nsupr - d_fsupc - nsupc; - krep_ind = lptr + nsupc - 1; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - - /* - * Case 1: Update U-segment of size 1 -- col-col update - */ - if ( segsze == 1 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - dense[irow] -= ukj*lusup[luptr]; - luptr++; - } - - } else if ( segsze <= 3 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { /* Case 2: 2cols-col update */ - ukj -= ukj1 * lusup[luptr1]; - dense[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - dense[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] ); - } - } else { /* Case 3: 3cols-col update */ - ukj2 = dense[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense[lsub[krep_ind]] = ukj; - dense[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - luptr2++; - dense[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - - - } else { - /* - * Case: sup-col update - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense - */ - - no_zeros = kfnz - fst_col; - - /* Copy U[*,j] segment from dense[*] to tempv[*] */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - tempv[i] = dense[irow]; - ++isub; - } - - /* Dense triangular solve -- start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze, - &lusup[luptr], &nsupr, tempv, &incx ); -#endif - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&segsze,&alpha, - &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, - &incy ); -#endif -#else - sludlsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - sludmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); -#endif - - - /* Scatter tempv[] into SPA dense[] as a temporary storage */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense[irow] = tempv[i]; - tempv[i] = zero; - ++isub; - } - - /* Scatter tempv1[] into SPA dense[] */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - dense[irow] -= tempv1[i]; - tempv1[i] = zero; - ++isub; - } - } - - } /* if jsupno ... */ - - } /* for each segment... */ - - /* - * Process the supernodal portion of L\U[*,j] - */ - nextlu = xlusup[jcol]; - fsupc = xsup[jsupno]; - - /* Copy the SPA dense into L\U[*,j] */ - new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; - while ( new_next > nzlumax ) { - if ((mem_error = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))) - return (mem_error); - lusup = Glu->lusup; - lsub = Glu->lsub; - } - - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = zero; - ++nextlu; - } - - xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ - - /* For more updates within the panel (also within the current supernode), - * should start from the first column of the panel, or the first column - * of the supernode, whichever is bigger. There are 2 cases: - * 1) fsupc < fpanelc, then fst_col := fpanelc - * 2) fsupc >= fpanelc, then fst_col := fsupc - */ - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - if ( fst_col < jcol ) { - - /* Distance between the current supernode and the current panel. - d_fsupc=0 if fsupc >= fpanelc. */ - d_fsupc = fst_col - fsupc; - - lptr = xlsub[fsupc] + d_fsupc; - luptr = xlusup[fst_col] + d_fsupc; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nsupc = jcol - fst_col; /* Excluding jcol */ - nrow = nsupr - d_fsupc - nsupc; - - /* Points to the beginning of jcol in snode L\U(jsupno) */ - ufirst = xlusup[jcol] + d_fsupc; - - ops[TRSV] += nsupc * (nsupc - 1); - ops[GEMV] += 2 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)( "L", "N", "U", &nsupc, - &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); -#endif - - alpha = none; beta = one; /* y := beta*y + alpha*A*x */ - -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &nrow, &nsupc, &alpha, - &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, - &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - sludlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - - sludmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], tempv ); - - /* Copy updates from tempv[*] into lusup[*] */ - isub = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - lusup[isub] -= tempv[i]; - tempv[i] = 0.0; - ++isub; - } - -#endif - - - } /* if fst_col < jcol ... */ - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,269 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -/* What type of supernodes we want */ -#define T2_SUPER - -int -dcolumn_dfs( - const int m, /* in - number of rows in the matrix */ - const int jcol, /* in */ - int *perm_r, /* in */ - int *nseg, /* modified - with new segments appended */ - int *lsub_col, /* in - defines the RHS vector to start the dfs */ - int *segrep, /* modified - with new segments appended */ - int *repfnz, /* modified */ - int *xprune, /* modified */ - int *marker, /* modified */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * "column_dfs" performs a symbolic factorization on column jcol, and - * decide the supernode boundary. - * - * This routine does not use numeric values, but only use the RHS - * row indices to start the dfs. - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. The routine returns a list of such supernodal - * representatives in topological order of the dfs that generates them. - * The location of the first nonzero in each such supernodal segment - * (supernodal entry location) is also returned. - * - * Local parameters - * ================ - * nseg: no of segments in current U[*,j] - * jsuper: jsuper=EMPTY if column j does not belong to the same - * supernode as j-1. Otherwise, jsuper=nsuper. - * - * marker2: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - * Return value - * ============ - * 0 success; - * > 0 number of bytes allocated when run out of space. - * - */ - int jcolp1, jcolm1, jsuper, nsuper, nextl; - int k, krep, krow, kmark, kperm; - int *marker2; /* Used for small panel LU */ - int fsupc; /* First column of a snode */ - int myfnz; /* First nonz column of a U-segment */ - int chperm, chmark, chrep, kchild; - int xdfs, maxdfs, kpar, oldrep; - int jptr, jm1ptr; - int ito, ifrom, istop; /* Used to compress row subscripts */ - int mem_error; - int *xsup, *supno, *lsub, *xlsub; - int nzlmax; - static int first = 1, maxsuper; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - if ( first ) { - maxsuper = sp_ienv(3); - first = 0; - } - jcolp1 = jcol + 1; - jcolm1 = jcol - 1; - nsuper = supno[jcol]; - jsuper = nsuper; - nextl = xlsub[jcol]; - marker2 = &marker[2*m]; - - - /* For each nonzero in A[*,jcol] do dfs */ - for (k = 0; lsub_col[k] != EMPTY; k++) { - - krow = lsub_col[k]; - lsub_col[k] = EMPTY; - kmark = marker2[krow]; - - /* krow was visited before, go to the next nonz */ - if ( kmark == jcol ) continue; - - /* For each unmarked nbr krow of jcol - * krow is in L: place it in structure of L[*,jcol] - */ - marker2[krow] = jcol; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - lsub[nextl++] = krow; /* krow is indexed into A */ - if ( nextl >= nzlmax ) { - if ( (mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ - } else { - /* krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz[krep]; - - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > kperm ) repfnz[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker2[kchild]; - - if ( chmark != jcol ) { /* Not reached yet */ - marker2[kchild] = jcol; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,k] */ - if ( chperm == EMPTY ) { - lsub[nextl++] = kchild; - if ( nextl >= nzlmax ) { - if ((mem_error = - dLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu))) - return (mem_error); - lsub = Glu->lsub; - } - if ( chmark != jcolm1 ) jsuper = EMPTY; - } else { - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz[chrep]; - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz[chrep] = chperm; - } else { - /* Continue dfs at super-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L^t) */ - parent[krep] = oldrep; - repfnz[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - } /* else */ - - } /* else */ - - } /* if */ - - } /* while */ - - /* krow has no more unexplored nbrs; - * place supernode-rep krep in postorder DFS. - * backtrack dfs to its parent - */ - segrep[*nseg] = krep; - ++(*nseg); - kpar = parent[krep]; /* Pop from stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - - } while ( kpar != EMPTY ); /* Until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonzero ... */ - - /* Check to see if j belongs in the same supernode as j-1 */ - if ( jcol == 0 ) { /* Do nothing for column 0 */ - nsuper = supno[0] = 0; - } else { - fsupc = xsup[nsuper]; - jptr = xlsub[jcol]; /* Not compressed yet */ - jm1ptr = xlsub[jcolm1]; - -#ifdef T2_SUPER - if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; -#endif - /* Make sure the number of columns in a supernode doesn't - exceed threshold. */ - if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; - - /* If jcol starts a new supernode, reclaim storage space in - * lsub from the previous supernode. Note we only store - * the subscript set of the first and last columns of - * a supernode. (first for num values, last for pruning) - */ - if ( jsuper == EMPTY ) { /* starts a new supernode */ - if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ -#ifdef CHK_COMPRESS - printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); -#endif - ito = xlsub[fsupc+1]; - xlsub[jcolm1] = ito; - istop = ito + jptr - jm1ptr; - xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ - xlsub[jcol] = istop; - for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) - lsub[ito] = lsub[ifrom]; - nextl = ito; /* = istop + length(jcol) */ - } - nsuper++; - supno[jcol] = nsuper; - } /* if a new supernode */ - - } /* else: jcol > 0 */ - - /* Tidy up the pointers before exit */ - xsup[nsuper+1] = jcolp1; - supno[jcolp1] = nsuper; - xprune[jcol] = nextl; /* Initialize upper bound for pruning */ - xlsub[jcolp1] = nextl; - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcomplex.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcomplex.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcomplex.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcomplex.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,106 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * This file defines common arithmetic operations for complex type. - */ -#include -#include -#include -#include "slu_dcomplex.h" - - -/* Complex Division c = a/b */ -void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) -{ - double ratio, den; - double abr, abi, cr, ci; - - if( (abr = b->r) < 0.) - abr = - abr; - if( (abi = b->i) < 0.) - abi = - abi; - if( abr <= abi ) { - if (abi == 0) { - fprintf(stderr, "z_div.c: division by zero\n"); - exit(-1); - } - ratio = b->r / b->i ; - den = b->i * (1 + ratio*ratio); - cr = (a->r*ratio + a->i) / den; - ci = (a->i*ratio - a->r) / den; - } else { - ratio = b->i / b->r ; - den = b->r * (1 + ratio*ratio); - cr = (a->r + a->i*ratio) / den; - ci = (a->i - a->r*ratio) / den; - } - c->r = cr; - c->i = ci; -} - - -/* Returns sqrt(z.r^2 + z.i^2) */ -double z_abs(doublecomplex *z) -{ - double temp; - double real = z->r; - double imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - if (imag > real) { - temp = real; - real = imag; - imag = temp; - } - if ((real+imag) == real) return(real); - - temp = imag/real; - temp = real*sqrt(1.0 + temp*temp); /*overflow!!*/ - return (temp); -} - - -/* Approximates the abs */ -/* Returns abs(z.r) + abs(z.i) */ -double z_abs1(doublecomplex *z) -{ - double real = z->r; - double imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - - return (real + imag); -} - -/* Return the exponentiation */ -void z_exp(doublecomplex *r, doublecomplex *z) -{ - double expx; - - expx = exp(z->r); - r->r = expx * cos(z->i); - r->i = expx * sin(z->i); -} - -/* Return the complex conjugate */ -void d_cnjg(doublecomplex *r, doublecomplex *z) -{ - r->r = z->r; - r->i = -z->i; -} - -/* Return the imaginary part */ -double d_imag(doublecomplex *z) -{ - return (z->i); -} - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - This file has been modified to take out compile warning and - to be compatible with the HYPRE linear solver -*/ - -#include "slu_ddefs.h" - -int -dcopy_to_ucol( - int jcol, /* in */ - int nseg, /* in */ - int *segrep, /* in */ - int *repfnz, /* in */ - int *perm_r, /* in */ - double *dense, /* modified - reset to zero on return */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Gather from SPA dense[*] to global ucol[*]. - */ - int ksub, krep, ksupno; - int i, k, kfnz, segsze; - int fsupc, isub, irow; - int jsupno, nextu; - int new_next, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - double *ucol; - int *usub, *xusub; - int nzumax; - - double zero = 0.0; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - nzumax = Glu->nzumax; - - jsupno = supno[jcol]; - nextu = xusub[jcol]; - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - krep = segrep[k--]; - ksupno = supno[krep]; - - if ( ksupno != jsupno ) { /* Should go into ucol[] */ - kfnz = repfnz[krep]; - if ( kfnz != EMPTY ) { /* Nonzero U-segment */ - - fsupc = xsup[ksupno]; - isub = xlsub[fsupc] + kfnz - fsupc; - segsze = krep - kfnz + 1; - - new_next = nextu + segsze; - while ( new_next > nzumax ) { - if ((mem_error = dLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu))) - return (mem_error); - ucol = Glu->ucol; - if ((mem_error = dLUMemXpand(jcol, nextu, USUB, &nzumax, Glu))) - return (mem_error); - usub = Glu->usub; - lsub = Glu->lsub; - } - - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - usub[nextu] = perm_r[irow]; - ucol[nextu] = dense[irow]; - dense[irow] = zero; - nextu++; - isub++; - } - - } - - } - - } /* for each segment... */ - - xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dGetDiagU.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dGetDiagU.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dGetDiagU.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dGetDiagU.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ -/* - * -- Auxiliary routine in SuperLU (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * Xiaoye S. Li - * September 11, 2003 - * - */ - -#include "dsp_defs.h" - - -void dGetDiagU(SuperMatrix *L, double *diagU) -{ - /* - * Purpose - * ======= - * - * GetDiagU extracts the main diagonal of matrix U of the LU factorization. - * - * Arguments - * ========= - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * diagU (output) double*, dimension (n) - * The main diagonal of matrix U. - * - * Note - * ==== - * The diagonal blocks of the L and U matrices are stored in the L - * data structures. - * - */ - int_t i, k, nsupers; - int_t fsupc, nsupr, nsupc, luptr; - double *dblock, *Lval; - SCformat *Lstore; - - Lstore = L->Store; - Lval = Lstore->nzval; - nsupers = Lstore->nsuper + 1; - - for (k = 0; k < nsupers; ++k) { - fsupc = L_FST_SUPC(k); - nsupc = L_FST_SUPC(k+1) - fsupc; - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - luptr = L_NZ_START(fsupc); - - dblock = &diagU[fsupc]; - for (i = 0; i < nsupc; ++i) { - dblock[i] = Lval[luptr]; - luptr += nsupr + 1; - } - } -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgscon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgscon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgscon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,151 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: dgscon.c - * History: Modified from lapack routines DGECON. - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" - -void -dgscon(char *norm, SuperMatrix *L, SuperMatrix *U, - double anorm, double *rcond, SuperLUStat_t *stat, int *info) -{ -/* - Purpose - ======= - - DGSCON estimates the reciprocal of the condition number of a general - real matrix A, in either the 1-norm or the infinity-norm, using - the LU factorization computed by DGETRF. - - An estimate is obtained for norm(inv(A)), and the reciprocal of the - condition number is computed as - RCOND = 1 / ( norm(A) * norm(inv(A)) ). - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - NORM (input) char* - Specifies whether the 1-norm condition number or the - infinity-norm condition number is required: - = '1' or 'O': 1-norm; - = 'I': Infinity-norm. - - L (input) SuperMatrix* - The factor L from the factorization Pr*A*Pc=L*U as computed by - dgstrf(). Use compressed row subscripts storage for supernodes, - i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - - U (input) SuperMatrix* - The factor U from the factorization Pr*A*Pc=L*U as computed by - dgstrf(). Use column-wise storage scheme, i.e., U has types: - Stype = SLU_NC, Dtype = SLU_D, Mtype = TRU. - - ANORM (input) double - If NORM = '1' or 'O', the 1-norm of the original matrix A. - If NORM = 'I', the infinity-norm of the original matrix A. - - RCOND (output) double* - The reciprocal of the condition number of the matrix A, - computed as RCOND = 1/(norm(A) * norm(inv(A))). - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - - ===================================================================== -*/ - - /* Local variables */ - int kase, kase1, onenrm, i; - double ainvnm; - double *work; - int *iwork; - extern int drscl_(int *, double *, double *, int *); - - extern int dlacon_(int *, double *, double *, int *, double *, int *); - - - /* Test the input parameters. */ - *info = 0; - onenrm = *(unsigned char *)norm == '1' || superlu_lsame(norm, "O"); - if (! onenrm && ! superlu_lsame(norm, "I")) *info = -1; - else if (L->nrow < 0 || L->nrow != L->ncol || - L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU) - *info = -2; - else if (U->nrow < 0 || U->nrow != U->ncol || - U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU) - *info = -3; - if (*info != 0) { - i = -(*info); - superlu_xerbla("dgscon", &i); - return; - } - - /* Quick return if possible */ - *rcond = 0.; - if ( L->nrow == 0 || U->nrow == 0) { - *rcond = 1.; - return; - } - - work = doubleCalloc( 3*L->nrow ); - iwork = intMalloc( L->nrow ); - - - if ( !work || !iwork ) - ABORT("Malloc fails for work arrays in dgscon."); - - /* Estimate the norm of inv(A). */ - ainvnm = 0.; - if ( onenrm ) kase1 = 1; - else kase1 = 2; - kase = 0; - - do { - dlacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase); - - if (kase == 0) break; - - if (kase == kase1) { - /* Multiply by inv(L). */ - sp_dtrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); - - /* Multiply by inv(U). */ - sp_dtrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); - - } else { - - /* Multiply by inv(U'). */ - sp_dtrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); - - /* Multiply by inv(L'). */ - sp_dtrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); - - } - - } while ( kase != 0 ); - - /* Compute the estimate of the reciprocal condition number. */ - if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; - - SUPERLU_FREE (work); - SUPERLU_FREE (iwork); - return; - -} /* dgscon */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsequ.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,193 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dgsequ.c - * History: Modified from LAPACK routine DGEEQU - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - - -#include -#include "slu_ddefs.h" -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif - -void -dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int *info) -{ -/* - Purpose - ======= - - DGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= A->nrow: the i-th row of A is exactly zero - > A->ncol: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int i, j, irow; - double rcmin, rcmax; - double bignum, smlnum; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - superlu_xerbla("dgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = (NCformat*) A->Store; - Aval = (double*) Astore->nzval; - - /* Get machine constants. */ - smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* dgsequ */ - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsrfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,447 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: dgsrfs.c - * History: Modified from lapack routine DGERFS - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" -#ifdef HYPRE_USING_HYPRE_BLAS -#include "hypre_blas.h" -#endif -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif -void -dgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, char *equed, double *R, double *C, - SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * DGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates for - * the solution. - * - * If equilibration was performed, the system becomes: - * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * A (input) SuperMatrix* - * The original matrix A in the system, or the scaled A if - * equilibration was done. The type of A can be: - * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_GE. - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use column-wise storage scheme, - * i.e., U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (A->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * equed (input) Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by - * diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * - * R (input) double*, dimension (A->nrow) - * The row scale factors for A. - * If equed = 'R' or 'B', A is premultiplied by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * - * C (input) double*, dimension (A->ncol) - * The column scale factors for A. - * If equed = 'C' or 'B', A is postmultiplied by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * - * B (input) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * The right hand side matrix B. - * if equed = 'R' or 'B', B is premultiplied by diag(R). - * - * X (input/output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * On entry, the solution matrix X, as computed by dgstrs(). - * On exit, the improved solution matrix X. - * if *equed = 'C' or 'B', X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * FERR (output) double*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * - * BERR (output) double*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if INFO = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 5 - - /* Table of constant values */ - int ione = 1; - double ndone = -1.; - double done = 1.; - - /* Local variables */ - NCformat *Astore; - double *Aval; - SuperMatrix Bjcol; - DNformat *Bstore, *Xstore, *Bjcol_store; - double *Bmat, *Xmat, *Bptr, *Xptr; - int kase; - double safe1, safe2; - int i, j, k, irow, nz, count, notran, rowequ, colequ; - int ldb, ldx, nrhs; - double s, xk, lstres, eps, safmin; - char transc[1]; - trans_t transt; - double *work; - double *rwork; - int *iwork; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *); - extern int dlacon_(int *, double *, double *, int *, double *, int *); -#ifdef _CRAY - extern int SCOPY(int *, double *, int *, double *, int *); - extern int SSAXPY(int *, double *, double *, int *, double *, int *); -#else - extern int hypre_F90_NAME_BLAS(dcopy,DCOPY)(int *,double *,int *,double *,int *); - extern int hypre_F90_NAME_BLAS(daxpy,DAXPY)(int *,double *,double *,int *,double *,int *); -#endif - - Astore = (NCformat*) A->Store; - Aval = ( double*) Astore->nzval; - Bstore = (DNformat*) B->Store; - Xstore = (DNformat*) X->Store; - Bmat = ( double*) Bstore->nzval; - Xmat = ( double*) Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - /* Test the input parameters */ - *info = 0; - notran = (trans == NOTRANS); - if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) - *info = -3; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) - *info = -4; - else if ( ldb < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) - *info = -10; - else if ( ldx < SUPERLU_MAX(0, A->nrow) || - X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) - *info = -11; - if (*info != 0) { - i = -(*info); - superlu_xerbla("dgsrfs", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || nrhs == 0) { - for (j = 0; j < nrhs; ++j) { - ferr[j] = 0.; - berr[j] = 0.; - } - return; - } - - rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B"); - colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B"); - - /* Allocate working space */ - work = doubleMalloc(2*A->nrow); - rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); - iwork = intMalloc(2*A->nrow); - if ( !work || !rwork || !iwork ) - ABORT("Malloc fails for work/rwork/iwork."); - - if ( notran ) { - *(unsigned char *)transc = 'N'; - transt = TRANS; - } else { - *(unsigned char *)transc = 'T'; - transt = NOTRANS; - } - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Epsilon"); - safmin = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - - /* Compute the number of nonzeros in each row (or column) of A */ - for (i = 0; i < A->nrow; ++i) iwork[i] = 0; - if ( notran ) { - for (k = 0; k < A->ncol; ++k) - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - ++iwork[Astore->rowind[i]]; - } else { - for (k = 0; k < A->ncol; ++k) - iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; - } - - /* Copy one column of RHS B into Bjcol. */ - Bjcol.Stype = B->Stype; - Bjcol.Dtype = B->Dtype; - Bjcol.Mtype = B->Mtype; - Bjcol.nrow = B->nrow; - Bjcol.ncol = 1; - Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); - Bjcol_store = (DNformat*) Bjcol.Store; - Bjcol_store->lda = ldb; - Bjcol_store->nzval = work; /* address aliasing */ - - /* Do for each right hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - Bptr = &Bmat[j*ldb]; - Xptr = &Xmat[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - -#ifdef _CRAY - SCOPY(&A->nrow, Bptr, &ione, work, &ione); -#else - hypre_F90_NAME_BLAS(dcopy,DCOPY)(&A->nrow,Bptr,&ione,work,&ione); -#endif - sp_dgemv(transc, ndone, A, Xptr, ione, done, work, ione); - - /* Compute componentwise relative backward error from formula - max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) - where abs(Z) is the componentwise absolute value of the matrix - or vector Z. If the i-th component of the denominator is less - than SAFE2, then SAFE1 is added to the i-th component of the - numerator and denominator before dividing. */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if (notran) { - for (k = 0; k < A->ncol; ++k) { - xk = fabs( Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - s += fabs(Aval[i]) * fabs(Xptr[irow]); - } - rwork[k] += s; - } - } - s = 0.; - for (i = 0; i < A->nrow; ++i) { - if (rwork[i] > safe2) - s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); - else - s = SUPERLU_MAX( s, (fabs(work[i]) + safe1) / - (rwork[i] + safe1) ); - } - berr[j] = s; - - /* Test stopping criterion. Continue iterating if - 1) The residual BERR(J) is larger than machine epsilon, and - 2) BERR(J) decreased by at least a factor of 2 during the - last iteration, and - 3) At most ITMAX iterations tried. */ - - if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { - /* Update solution and try again. */ - dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - -#ifdef _CRAY - SAXPY(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#else - hypre_F90_NAME_BLAS(daxpy,DAXPY)(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#endif - lstres = berr[j]; - ++count; - } else { - break; - } - - } /* end while */ - - stat->RefineSteps = count; - - /* Bound error from formula: - norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* - ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) - where - norm(Z) is the magnitude of the largest component of Z - inv(op(A)) is the inverse of op(A) - abs(Z) is the componentwise absolute value of the matrix or - vector Z - NZ is the maximum number of nonzeros in any row of A, plus 1 - EPS is machine epsilon - - The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) - is incremented by SAFE1 if the i-th component of - abs(op(A))*abs(X) + abs(B) is less than SAFE2. - - Use DLACON to estimate the infinity-norm of the matrix - inv(op(A)) * diag(W), - where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if ( notran ) { - for (k = 0; k < A->ncol; ++k) { - xk = fabs( Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - xk = fabs( Xptr[irow] ); - s += fabs(Aval[i]) * xk; - } - rwork[k] += s; - } - } - - for (i = 0; i < A->nrow; ++i) - if (rwork[i] > safe2) - rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; - else - rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; - - kase = 0; - - do { - dlacon_(&A->nrow, &work[A->nrow], work, - &iwork[A->nrow], &ferr[j], &kase); - if (kase == 0) break; - - if (kase == 1) { - /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; - else if ( !notran && rowequ ) - for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; - - dgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); - - for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; - } else { - /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ - for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; - - dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; - else if ( !notran && rowequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; - } - - } while ( kase != 0 ); - - - /* Normalize error. */ - lstres = 0.; - if ( notran && colequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); - } else if ( !notran && rowequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); - } else { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); - } - if ( lstres != 0. ) - ferr[j] /= lstres; - - } /* for each RHS j ... */ - - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - SUPERLU_FREE(iwork); - SUPERLU_FREE(Bjcol.Store); - - return; - -} /* dgsrfs */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssv.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,226 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include "slu_ddefs.h" - -void -dgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, - SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * DGSSV solves the system of linear equations A*X=B, using the - * LU factorization from DGSTRF. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. Permute the columns of A, forming A*Pc, where Pc - * is a permutation matrix. For more details of this step, - * see sp_preorder.c. - * - * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined - * by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 1.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the - * above algorithm to the transpose of A: - * - * 2.1. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix. - * For more details of this step, see sp_preorder.c. - * - * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr - * determined by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 2.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR; Dtype = SLU_D; Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, column permutation vector of size A->ncol - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * If A->Stype = SLU_NR, column permutation vector of size A->nrow - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or - * options->Fact = SamePattern_SameRowPerm, it is an input argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * Otherwise, it is an output argument. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->RowPerm = MY_PERMR or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument. - * otherwise it is an output argument. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - DNformat *Bstore; - SuperMatrix *AA=NULL;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int lwork = 0, *etree, i; - - /* Set default values for some parameters */ - double drop_tol = 0.; - int panel_size; /* panel size */ - int relax; /* no of columns in a relaxed snodes */ - int permc_spec; - trans_t trans = NOTRANS; - double *utime; - double t; /* Temporary time */ - - /* Test the input parameters ... */ - *info = 0; - Bstore = (DNformat*) B->Store; - if ( options->Fact != DOFACT ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) - *info = -7; - if ( *info != 0 ) { - i = -(*info); - superlu_xerbla("dgssv", &i); - return; - } - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = (NRformat *) A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - (double*)Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - trans = TRANS; - } else { - if ( A->Stype == SLU_NC ) AA = A; - } - - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t; - - etree = intMalloc(A->ncol); - - t = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t; - - panel_size = sp_ienv(1); - relax = sp_ienv(2); - - /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4));*/ - t = SuperLU_timer_(); - /* Compute the LU factorization of A. */ - dgstrf(options, &AC, drop_tol, relax, panel_size, - etree, NULL, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t; - - t = SuperLU_timer_(); - if ( *info == 0 ) { - /* Solve the system A*X=B, overwriting B with X. */ - dgstrs (trans, L, U, perm_c, perm_r, B, stat, info); - } - utime[SOLVE] = SuperLU_timer_() - t; - - SUPERLU_FREE (etree); - Destroy_CompCol_Permuted(&AC); - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssvx.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,622 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - - -#include "slu_ddefs.h" -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif - -void -dgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - int *etree, char *equed, double *R, double *C, - SuperMatrix *L, SuperMatrix *U, void *work, int lwork, - SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, - double *rcond, double *ferr, double *berr, - mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using - * the LU factorization from dgstrf(). Error bounds on the solution and - * a condition estimate are also provided. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A is - * overwritten by diag(R)*A*diag(C) and B by diag(R)*B - * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans - * = TRANS or CONJ). - * - * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation - * matrix that usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 1.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the matrix A (after equilibration if options->Equil = YES) - * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. - * - * 1.4. Compute the reciprocal pivot growth factor. - * - * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form of - * A is used to estimate the condition number of the matrix A. If - * the reciprocal of the condition number is less than machine - * precision, info = A->ncol+1 is returned as a warning, but the - * routine still goes on to solve for X and computes error bounds - * as described below. - * - * 1.6. The system of equations is solved for X using the factored form - * of A. - * - * 1.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 1.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm - * to the transpose of A: - * - * 2.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A' is - * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B - * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). - * - * 2.2. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix that - * usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 2.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the transpose(A) (after equilibration if - * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the - * permutation Pr determined by partial pivoting. - * - * 2.4. Compute the reciprocal pivot growth factor. - * - * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form - * of transpose(A) is used to estimate the condition number of the - * matrix A. If the reciprocal of the condition number - * is less than machine precision, info = A->nrow+1 is returned as - * a warning, but the routine still goes on to solve for X and - * computes error bounds as described below. - * - * 2.6. The system of equations is solved for X using the factored form - * of transpose(A). - * - * 2.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 2.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input/output) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * On entry, If options->Fact = FACTORED and equed is not 'N', - * then A must have been equilibrated by the scaling factors in - * R and/or C. - * On exit, A is not modified if options->Equil = NO, or if - * options->Equil = YES but equed = 'N' on exit. - * Otherwise, if options->Equil = YES and equed is not 'N', - * A is scaled as follows: - * If A->Stype = SLU_NC: - * equed = 'R': A := diag(R) * A - * equed = 'C': A := A * diag(C) - * equed = 'B': A := diag(R) * A * diag(C). - * If A->Stype = SLU_NR: - * equed = 'R': transpose(A) := diag(R) * transpose(A) - * equed = 'C': transpose(A) := transpose(A) * diag(C) - * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * - * If A->Stype = SLU_NR, column permutation vector of size A->nrow, - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by a - * new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument. - * - * etree (input/output) int*, dimension (A->ncol) - * Elimination tree of Pc'*A'*A*Pc. - * If options->Fact != FACTORED and options->Fact != DOFACT, - * etree is an input argument, otherwise it is an output argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * equed (input/output) char* - * Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * If options->Fact = FACTORED, equed is an input argument, - * otherwise it is an output argument. - * - * R (input/output) double*, dimension (A->nrow) - * The row scale factors for A or transpose(A). - * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * If options->Fact = FACTORED, R is an input argument, - * otherwise, R is output. - * If options->zFact = FACTORED and equed = 'R' or 'B', each element - * of R must be positive. - * - * C (input/output) double*, dimension (A->ncol) - * The column scale factors for A or transpose(A). - * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * If options->Fact = FACTORED, C is an input argument, - * otherwise, C is output. - * If options->Fact = FACTORED and equed = 'C' or 'B', each element - * of C must be positive. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. - * - * work (workspace/output) void*, size (lwork) (in bytes) - * User supplied workspace, should be large enough - * to hold data structures for factors L and U. - * On exit, if fact is not 'F', L and U point to this array. - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * mem_usage->total_needed; no other side effects. - * - * See argument 'mem_usage' for memory usage statistics. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * If B->ncol = 0, only LU decomposition is performed, the triangular - * solve is skipped. - * On exit, - * if equed = 'N', B is not modified; otherwise - * if A->Stype = SLU_NC: - * if options->Trans = NOTRANS and equed = 'R' or 'B', - * B is overwritten by diag(R)*B; - * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', - * B is overwritten by diag(C)*B; - * if A->Stype = SLU_NR: - * if options->Trans = NOTRANS and equed = 'C' or 'B', - * B is overwritten by diag(C)*B; - * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', - * B is overwritten by diag(R)*B. - * - * X (output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * If info = 0 or info = A->ncol+1, X contains the solution matrix - * to the original system of equations. Note that A and B are modified - * on exit if equed is not 'N', and the solution to the equilibrated - * system is inv(diag(C))*X if options->Trans = NOTRANS and - * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' - * and equed = 'R' or 'B'. - * - * recip_pivot_growth (output) double* - * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). - * The infinity norm is used. If recip_pivot_growth is much less - * than 1, the stability of the LU factorization could be poor. - * - * rcond (output) double* - * The estimate of the reciprocal condition number of the matrix A - * after equilibration (if done). If rcond is less than the machine - * precision (in particular, if rcond = 0), the matrix is singular - * to working precision. This condition is indicated by a return - * code of info > 0. - * - * FERR (output) double*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * If options->IterRefine = NOREFINE, ferr = 1.0. - * - * BERR (output) double*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * If options->IterRefine = NOREFINE, berr = 1.0. - * - * mem_usage (output) mem_usage_t* - * Record the memory usage statistics, consisting of following fields: - * - for_lu (float) - * The amount of space used in bytes for L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * The number of memory expansions during the LU factorization. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly - * singular, so the solution and error bounds - * could not be computed. - * = A->ncol+1: U is nonsingular, but RCOND is less than machine - * precision, meaning that the matrix is singular to - * working precision. Nevertheless, the solution and - * error bounds are computed because there are a number - * of situations where the computed solution can be more - * accurate than the value of RCOND would suggest. - * > A->ncol+1: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - - DNformat *Bstore, *Xstore; - double *Bmat, *Xmat; - int ldb, ldx, nrhs; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int colequ, equil, nofact, notran, rowequ, permc_spec; - trans_t trant; - char norm[1]; - int i, j, info1; - double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; - int relax, panel_size; - double drop_tol; - double t0; /* temporary time */ - double *utime; - - /* External functions */ - extern double dlangs(char *, SuperMatrix *); - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *); - - Bstore = (DNformat*) B->Store; - Xstore = (DNformat*) X->Store; - Bmat = ( double*) Bstore->nzval; - Xmat = ( double*) Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - *info = 0; - nofact = (options->Fact != FACTORED); - equil = (options->Equil == YES); - notran = (options->Trans == NOTRANS); - if ( nofact ) { - *(unsigned char *)equed = 'N'; - rowequ = FALSE; - colequ = FALSE; - } else { - rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B"); - colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B"); - smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum"); - bignum = 1. / smlnum; - } - -#if 0 -printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", - options->Fact, options->Trans, *equed); -#endif - - /* Test the input parameters */ - if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && - options->Fact != SamePattern_SameRowPerm && - !notran && options->Trans != TRANS && options->Trans != CONJ && - !equil && options->Equil != NO) - *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -2; - else if (options->Fact == FACTORED && - !(rowequ || colequ || superlu_lsame(equed, "N"))) - *info = -6; - else { - if (rowequ) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, R[j]); - rcmax = SUPERLU_MAX(rcmax, R[j]); - } - if (rcmin <= 0.) *info = -7; - else if ( A->nrow > 0) - rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else rowcnd = 1.; - } - if (colequ && *info == 0) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, C[j]); - rcmax = SUPERLU_MAX(rcmax, C[j]); - } - if (rcmin <= 0.) *info = -8; - else if (A->nrow > 0) - colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else colcnd = 1.; - } - if (*info == 0) { - if ( lwork < -1 ) *info = -12; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_D || - B->Mtype != SLU_GE ) - *info = -13; - else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || - (B->ncol != 0 && B->ncol != X->ncol) || - X->Stype != SLU_DN || - X->Dtype != SLU_D || X->Mtype != SLU_GE ) - *info = -14; - } - } - if (*info != 0) { - i = -(*info); - superlu_xerbla("dgssvx", &i); - return; - } - - /* Initialization for factor parameters */ - panel_size = sp_ienv(1); - relax = sp_ienv(2); - drop_tol = 0.0; - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = (NRformat*) A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - (double*) Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - if ( notran ) { /* Reverse the transpose argument. */ - trant = TRANS; - notran = 0; - } else { - trant = NOTRANS; - notran = 1; - } - } else { /* A->Stype == SLU_NC */ - trant = options->Trans; - AA = A; - } - - if ( nofact && equil ) { - t0 = SuperLU_timer_(); - /* Compute row and column scalings to equilibrate the matrix A. */ - dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); - - if ( info1 == 0 ) { - /* Equilibrate matrix A. */ - dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); - rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B"); - colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B"); - } - utime[EQUIL] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Scale the right hand side if equilibration was performed. */ - if ( notran ) { - if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Bmat[i + j*ldb] *= R[i]; - } - } - } else if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Bmat[i + j*ldb] *= C[i]; - } - } - } - - if ( nofact ) { - - t0 = SuperLU_timer_(); - /* - * Gnet column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t0; - - t0 = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t0; - -/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4)); - fflush(stdout); */ - - /* Compute the LU factorization of A*Pc. */ - t0 = SuperLU_timer_(); - dgstrf(options, &AC, drop_tol, relax, panel_size, - etree, work, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t0; - - if ( lwork == -1 ) { - mem_usage->total_needed = *info - A->ncol; - return; - } - } - - if ( options->PivotGrowth ) { - if ( *info > 0 ) { - if ( *info <= A->ncol ) { - /* Compute the reciprocal pivot growth factor of the leading - rank-deficient *info columns of A. */ - *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U); - } - return; - } - - /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ - *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); - } - - if ( options->ConditionNumber ) { - /* Estimate the reciprocal of the condition number of A. */ - t0 = SuperLU_timer_(); - if ( notran ) { - *(unsigned char *)norm = '1'; - } else { - *(unsigned char *)norm = 'I'; - } - anorm = dlangs(norm, AA); - dgscon(norm, L, U, anorm, rcond, stat, info); - utime[RCOND] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Compute the solution matrix X. */ - for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ - for (i = 0; i < B->nrow; i++) - Xmat[i + j*ldx] = Bmat[i + j*ldb]; - - t0 = SuperLU_timer_(); - dgstrs (trant, L, U, perm_c, perm_r, X, stat, info); - utime[SOLVE] = SuperLU_timer_() - t0; - - /* Use iterative refinement to improve the computed solution and compute - error bounds and backward error estimates for it. */ - t0 = SuperLU_timer_(); - if ( options->IterRefine != NOREFINE ) { - dgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, - X, ferr, berr, stat, info); - } else { - for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; - } - utime[REFINE] = SuperLU_timer_() - t0; - - /* Transform the solution matrix X to a solution of the original system. */ - if ( notran ) { - if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Xmat[i + j*ldx] *= C[i]; - } - } - } else if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Xmat[i + j*ldx] *= R[i]; - } - } - } /* end if nrhs > 0 */ - - if ( options->ConditionNumber ) { - /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ - if (*rcond < hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("E")) *info=A->ncol+1; - } - - if ( nofact ) { - dQuerySpace(L, U, mem_usage); - Destroy_CompCol_Permuted(&AC); - } - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrf.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,433 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -void -dgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol, - int relax, int panel_size, int *etree, void *work, int lwork, - int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * DGSTRF computes an LU factorization of a general sparse m-by-n - * matrix A using partial pivoting with row interchanges. - * The factorization has the form - * Pr * A = L * U - * where Pr is a row permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper - * triangular (upper trapezoidal if A->nrow < A->ncol). - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. - * - * drop_tol (input) double (NOT IMPLEMENTED) - * Drop tolerance parameter. At step j of the Gaussian elimination, - * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. - * 0 <= drop_tol <= 1. The default value of drop_tol is 0. - * - * relax (input) int - * To control degree of relaxing supernodes. If the number - * of nodes (columns) in a subtree of the elimination tree is less - * than relax, this subtree is considered as one supernode, - * regardless of the row structures of those columns. - * - * panel_size (input) int - * A panel consists of at most panel_size consecutive columns. - * - * etree (input) int*, dimension (A->ncol) - * Elimination tree of A'*A. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * On input, the columns of A should be permuted so that the - * etree is in a certain postorder. - * - * work (input/output) void*, size (lwork) (in bytes) - * User-supplied work space and space for the output data structures. - * Not referenced if lwork = 0; - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * *info; no other side effects. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * When searching for diagonal, perm_c[*] is applied to the - * row subscripts of A, so that diagonal threshold pivoting - * can find the diagonal of A, rather than that of A*Pc. - * - * perm_r (input/output) int*, dimension (A->nrow) - * Row permutation vector which defines the permutation matrix Pr, - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by - * a new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument; - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = SLU_NC, - * Dtype = SLU_D, Mtype = SLU_TRU. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. If lwork = -1, it is - * the estimated amount of space needed, plus A->ncol. - * - * ====================================================================== - * - * Local Working Arrays: - * ====================== - * m = number of rows in the matrix - * n = number of columns in the matrix - * - * xprune[0:n-1]: xprune[*] points to locations in subscript - * vector lsub[*]. For column i, xprune[i] denotes the point where - * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need - * to be traversed for symbolic factorization. - * - * marker[0:3*m-1]: marker[i] = j means that node i has been - * reached when working on column j. - * Storage: relative to original row subscripts - * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, - * see dpanel_dfs.c; marker2 is used for inner-factorization, - * see dcolumn_dfs.c. - * - * parent[0:m-1]: parent vector used during dfs - * Storage: relative to new row subscripts - * - * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) - * unexplored neighbor of i in lsub[*] - * - * segrep[0:nseg-1]: contains the list of supernodal representatives - * in topological order of the dfs. A supernode representative is the - * last column of a supernode. - * The maximum size of segrep[] is n. - * - * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a - * supernodal representative r, repfnz[r] is the location of the first - * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 - * indicates the supernode r has been explored. - * NOTE: There are W of them, each used for one column of a panel. - * - * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below - * the panel diagonal. These are filled in during dpanel_dfs(), and are - * used later in the inner LU factorization within the panel. - * panel_lsub[]/dense[] pair forms the SPA data structure. - * NOTE: There are W of them. - * - * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; - * NOTE: there are W of them. - * - * tempv[0:*]: real temporary used for dense numeric kernels; - * The size of this array is defined by NUM_TEMPV() in dsp_defs.h. - * - */ - /* Local working arrays */ - NCPformat *Astore; - int *iperm_r = NULL; /* inverse of perm_r; used when - options->Fact == SamePattern_SameRowPerm */ - int *iperm_c; /* inverse of perm_c */ - int *iwork; - double *dwork; - int *segrep, *repfnz, *parent, *xplore; - int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ - int *xprune; - int *marker; - double *dense, *tempv; - int *relax_end; - double *a; - int *asub; - int *xa_begin, *xa_end; - int *xsup, *supno; - int *xlsub, *xlusup, *xusub; - int nzlumax; - static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ - - /* Local scalars */ - fact_t fact = options->Fact; - double diag_pivot_thresh = options->DiagPivotThresh; - int pivrow; /* pivotal row number in the original matrix A */ - int nseg1; /* no of segments in U-column above panel row jcol */ - int nseg; /* no of segments in each U-column */ - register int jcol; - register int kcol; /* end column of a relaxed snode */ - register int icol; - register int i, k, jj, new_next, iinfo; - int m, n, min_mn, jsupno, fsupc, nextlu, nextu; - int w_def; /* upper bound on panel width */ - int usepr, iperm_r_allocated = 0; - int nnzL, nnzU; - int *panel_histo = stat->panel_histo; - flops_t *ops = stat->ops; - - iinfo = 0; - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN(m, n); - Astore = (NCPformat*) A->Store; - a = ( double*) Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - - /* Allocate storage common to the factor routines */ - *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, - panel_size, L, U, &Glu, &iwork, &dwork); - if ( *info ) return; - - xsup = Glu.xsup; - supno = Glu.supno; - xlsub = Glu.xlsub; - xlusup = Glu.xlusup; - xusub = Glu.xusub; - - SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, - &repfnz, &panel_lsub, &xprune, &marker); - dSetRWork(m, panel_size, dwork, &dense, &tempv); - - usepr = (fact == SamePattern_SameRowPerm); - if ( usepr ) { - /* Compute the inverse of perm_r */ - iperm_r = (int *) intMalloc(m); - for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; - iperm_r_allocated = 1; - } - iperm_c = (int *) intMalloc(n); - for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; - - /* Identify relaxed snodes */ - relax_end = (int *) intMalloc(n); - if ( options->SymmetricMode == YES ) { - heap_relax_snode(n, etree, relax, marker, relax_end); - } else { - relax_snode(n, etree, relax, marker, relax_end); - } - - ifill (perm_r, m, EMPTY); - ifill (marker, m * NO_MARKER, EMPTY); - supno[0] = -1; - xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; - w_def = panel_size; - - /* - * Work on one "panel" at a time. A panel is one of the following: - * (a) a relaxed supernode at the bottom of the etree, or - * (b) panel_size contiguous columns, defined by the user - */ - for (jcol = 0; jcol < min_mn; ) { - - if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ - kcol = relax_end[jcol]; /* end of the relaxed snode */ - panel_histo[kcol-jcol+1]++; - - /* -------------------------------------- - * Factorize the relaxed supernode(jcol:kcol) - * -------------------------------------- */ - /* Determine the union of the row structure of the snode */ - if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, - xprune, marker, &Glu)) != 0 ) - return; - - nextu = xusub[jcol]; - nextlu = xlusup[jcol]; - jsupno = supno[jcol]; - fsupc = xsup[jsupno]; - new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); - nzlumax = Glu.nzlumax; - while ( new_next > nzlumax ) { - if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) - return; - } - - for (icol = jcol; icol<= kcol; icol++) { - xusub[icol+1] = nextu; - - /* Scatter into SPA dense[*] */ - for (k = xa_begin[icol]; k < xa_end[icol]; k++) - dense[asub[k]] = a[k]; - - /* Numeric update within the snode */ - dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); - - if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - -#ifdef DEBUG - dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); -#endif - - } - - jcol = icol; - - } else { /* Work on one panel of panel_size columns */ - - /* Adjust panel_size so that a panel won't overlap with the next - * relaxed snode. - */ - panel_size = w_def; - for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) - if ( relax_end[k] != EMPTY ) { - panel_size = k - jcol; - break; - } - if ( k == min_mn ) panel_size = min_mn - jcol; - panel_histo[panel_size]++; - - /* symbolic factor on a panel of columns */ - dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, - dense, panel_lsub, segrep, repfnz, xprune, - marker, parent, xplore, &Glu); - - /* numeric sup-panel updates in topological order */ - dpanel_bmod(m, panel_size, jcol, nseg1, dense, - tempv, segrep, repfnz, &Glu, stat); - - /* Sparse LU within the panel, and below panel diagonal */ - for ( jj = jcol; jj < jcol + panel_size; jj++) { - k = (jj - jcol) * m; /* column index for w-wide arrays */ - - nseg = nseg1; /* Begin after all the panel segments */ - - if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], - segrep, &repfnz[k], xprune, marker, - parent, xplore, &Glu)) != 0) return; - - /* Numeric updates */ - if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], - tempv, &segrep[nseg1], &repfnz[k], - jcol, &Glu, stat)) != 0) return; - - /* Copy the U-segments to ucol[*] */ - if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], - perm_r, &dense[k], &Glu)) != 0) - return; - - if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - - /* Prune columns (0:jj-1) using column jj */ - dpruneL(jj, perm_r, pivrow, nseg, segrep, - &repfnz[k], xprune, &Glu); - - /* Reset repfnz[] for this column */ - resetrep_col (nseg, segrep, &repfnz[k]); - -#ifdef DEBUG - dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); -#endif - - } - - jcol += panel_size; /* Move to the next panel */ - - } /* else */ - - } /* for */ - - *info = iinfo; - - if ( m > n ) { - k = 0; - for (i = 0; i < m; ++i) - if ( perm_r[i] == EMPTY ) { - perm_r[i] = n + k; - ++k; - } - } - - countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); - fixupL(min_mn, perm_r, &Glu); - - dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ - - if ( fact == SamePattern_SameRowPerm ) { - /* L and U structures may have changed due to possibly different - pivoting, even though the storage is available. - There could also be memory expansions, so the array locations - may have changed, */ - ((SCformat *)L->Store)->nnz = nnzL; - ((SCformat *)L->Store)->nsuper = Glu.supno[n]; - ((SCformat *)L->Store)->nzval = Glu.lusup; - ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; - ((SCformat *)L->Store)->rowind = Glu.lsub; - ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; - ((NCformat *)U->Store)->nnz = nnzU; - ((NCformat *)U->Store)->nzval = Glu.ucol; - ((NCformat *)U->Store)->rowind = Glu.usub; - ((NCformat *)U->Store)->colptr = Glu.xusub; - } else { - dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, - Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, - Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); - dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, - Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); - } - - ops[FACT] += ops[TRSV] + ops[GEMV]; - - if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); - SUPERLU_FREE (iperm_c); - SUPERLU_FREE (relax_end); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,341 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include "slu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -/* - * Function prototypes - */ -void sludusolve(int, int, double*, double*); -void sludlsolve(int, int, double*, double*); -void sludmatvec(int, int, int, double*, double*, double*); - -void -dgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, SuperMatrix *B, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * DGSTRS solves a system of linear equations A*X=B or A'*X=B - * with A sparse and B dense, using the LU factorization computed by - * DGSTRF. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (L->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (L->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ -#ifdef _CRAY - _fcd ftcs1, ftcs2, ftcs3, ftcs4; -#endif -#if 0 - int incx = 1, incy = 1; -#endif -#ifdef USE_VENDOR_BLAS - double alpha = 1.0, beta = 1.0; - double *work_col; -#endif - DNformat *Bstore; - double *Bmat; - SCformat *Lstore; - NCformat *Ustore; - double *Lval, *Uval; - int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; - int i, j, k, iptr, jcol, n, ldb, nrhs; - double *work, *rhs_work, *soln; - flops_t solve_ops; - void dprint_soln(); - - /* Test input parameters ... */ - *info = 0; - Bstore = (DNformat*) B->Store; - ldb = Bstore->lda; - nrhs = B->ncol; - if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) - *info = -2; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) - *info = -3; - else if ( ldb < SUPERLU_MAX(0, L->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) - *info = -6; - if ( *info ) { - i = -(*info); - superlu_xerbla("dgstrs", &i); - return; - } - - n = L->nrow; - work = doubleCalloc(n * nrhs); - if ( !work ) ABORT("Malloc fails for local work[]."); - soln = doubleMalloc(n); - if ( !soln ) ABORT("Malloc fails for local soln[]."); - - Bmat = (double*) Bstore->nzval; - Lstore = (SCformat*) L->Store; - Lval = (double*)Lstore->nzval; - Ustore = (NCformat*) U->Store; - Uval = (double*)Ustore->nzval; - solve_ops = 0; - - if ( trans == NOTRANS ) { - /* Permute right hand sides to form Pr*B */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - /* Forward solve PLy=Pb. */ - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1) * nrhs; - solve_ops += 2 * nrow * nsupc * nrhs; - - if ( nsupc == 1 ) { - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - luptr = L_NZ_START(fsupc); - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ - irow = L_SUB(iptr); - ++luptr; - rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; - } - } - } else { - luptr = L_NZ_START(fsupc); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","L","N","U",&nsupc, - &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], - &ldb); - hypre_F90_NAME_BLAS(dgemm,DGEMM)("N","N",&nrow,&nrhs,&nsupc, - &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#endif - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - work_col = &work[j*n]; - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work_col[i]; /* Scatter */ - work_col[i] = 0.0; - iptr++; - } - } -#else - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - sludlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); - sludmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], - &rhs_work[fsupc], &work[0] ); - - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work[i]; - work[i] = 0.0; - iptr++; - } - } -#endif - } /* else ... */ - } /* for L-solve */ - -#ifdef DEBUG - printf("After L-solve: y=\n"); - dprint_soln(n, nrhs, Bmat); -#endif - - /* - * Back solve Ux=y. - */ - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += nsupc * (nsupc + 1) * nrhs; - - if ( nsupc == 1 ) { - rhs_work = &Bmat[0]; - for (j = 0; j < nrhs; j++) { - rhs_work[fsupc] /= Lval[luptr]; - rhs_work += ldb; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("U", strlen("U")); - ftcs3 = _cptofcd("N", strlen("N")); - STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#else - hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","U","N","N", &nsupc, - &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], - &ldb); -#endif -#else - for (j = 0; j < nrhs; j++) - sludusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); -#endif - } - - for (j = 0; j < nrhs; ++j) { - rhs_work = &Bmat[j*ldb]; - for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ - irow = U_SUB(i); - rhs_work[irow] -= rhs_work[jcol] * Uval[i]; - } - } - } - - } /* for U-solve */ - -#ifdef DEBUG - printf("After U-solve: x=\n"); - dprint_soln(n, nrhs, Bmat); -#endif - - /* Compute the final solution X := Pc*X. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = solve_ops; - - } else { /* Solve A'*X=B or CONJ(A)*X=B */ - /* Permute right hand sides to form Pc'*B. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = 0; - for (k = 0; k < nrhs; ++k) { - - /* Multiply by inv(U'). */ - sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by inv(L'). */ - sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); - - } - /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - } - - SUPERLU_FREE(work); - SUPERLU_FREE(soln); -} - -/* - * Diagnostic print of the solution vector - */ -void -dprint_soln(int n, int nrhs, double *soln) -{ - int i; - - for (i = 0; i < n; i++) - printf("\t%d: %.4f\n", i, soln[i]); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrsL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrsL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrsL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrsL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,233 +0,0 @@ - - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * September 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" -#include "slu_util.h" - - -/* - * Function prototypes - */ -void dusolve(int, int, double*, double*); -void dlsolve(int, int, double*, double*); -void dmatvec(int, int, int, double*, double*, double*); - - -void -dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info) -{ -/* - * Purpose - * ======= - * - * DGSTRSL only performs the L-solve using the LU factorization computed - * by DGSTRF. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) char* - * Specifies the form of the system of equations: - * = 'N': A * X = B (No transpose) - * = 'T': A'* X = B (Transpose) - * = 'C': A**H * X = B (Conjugate transpose) - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * dgstrf(). Use column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. - * - * perm_r (input) int*, dimension (L->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ -#ifdef _CRAY - _fcd ftcs1, ftcs2, ftcs3, ftcs4; -#endif - int incx = 1, incy = 1; - double alpha = 1.0, beta = 1.0; - DNformat *Bstore; - double *Bmat; - SCformat *Lstore; - double *Lval, *Uval; - int nrow, notran; - int fsupc, nsupr, nsupc, luptr, istart, irow; - int i, j, k, iptr, jcol, n, ldb, nrhs; - double *work, *work_col, *rhs_work, *soln; - flops_t solve_ops; - extern SuperLUStat_t SuperLUStat; - void dprint_soln(); - - /* Test input parameters ... */ - *info = 0; - Bstore = B->Store; - ldb = Bstore->lda; - nrhs = B->ncol; - notran = lsame_(trans, "N"); - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) - *info = -2; - else if ( ldb < SUPERLU_MAX(0, L->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) - *info = -4; - if ( *info ) { - i = -(*info); - xerbla_("dgstrsL", &i); - return; - } - - n = L->nrow; - work = doubleCalloc(n * nrhs); - if ( !work ) ABORT("Malloc fails for local work[]."); - soln = doubleMalloc(n); - if ( !soln ) ABORT("Malloc fails for local soln[]."); - - Bmat = Bstore->nzval; - Lstore = L->Store; - Lval = Lstore->nzval; - solve_ops = 0; - - if ( notran ) { - /* Permute right hand sides to form Pr*B */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - /* Forward solve PLy=Pb. */ - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1) * nrhs; - solve_ops += 2 * nrow * nsupc * nrhs; - - if ( nsupc == 1 ) { - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - luptr = L_NZ_START(fsupc); - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ - irow = L_SUB(iptr); - ++luptr; - rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; - } - } - } else { - luptr = L_NZ_START(fsupc); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#else - dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#endif - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - work_col = &work[j*n]; - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work_col[i]; /* Scatter */ - work_col[i] = 0.0; - iptr++; - } - } -#else - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); - dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], - &rhs_work[fsupc], &work[0] ); - - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work[i]; - work[i] = 0.0; - iptr++; - } - } -#endif - } /* else ... */ - } /* for L-solve */ - -#ifdef DEBUG - printf("After L-solve: y=\n"); - dprint_soln(n, nrhs, Bmat); -#endif - - SuperLUStat.ops[SOLVE] = solve_ops; - - } else { - printf("Transposed solve not implemented.\n"); - exit(0); - } - - SUPERLU_FREE(work); - SUPERLU_FREE(soln); -} - -/* - * Diagnostic print of the solution vector - */ -void -dprint_soln(int n, int nrhs, double *soln) -{ - int i; - - for (i = 0; i < n; i++) - printf("\t%d: %.4f\n", i, soln[i]); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlacon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlacon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlacon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,237 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include -#ifdef HYPRE_USING_HYPRE_BLAS -#include "hypre_blas.h" -#endif -/*#include "slu_Cnames.h"*/ - -int -dlacon_(int *n, double *v, double *x, int *isgn, double *est, int *kase) - -{ -/* - Purpose - ======= - - DLACON estimates the 1-norm of a square matrix A. - Reverse communication is used for evaluating matrix-vector products. - - - Arguments - ========= - - N (input) INT - The order of the matrix. N >= 1. - - V (workspace) DOUBLE PRECISION array, dimension (N) - On the final return, V = A*W, where EST = norm(V)/norm(W) - (W is not returned). - - X (input/output) DOUBLE PRECISION array, dimension (N) - On an intermediate return, X should be overwritten by - A * X, if KASE=1, - A' * X, if KASE=2, - and DLACON must be re-called with all the other parameters - unchanged. - - ISGN (workspace) INT array, dimension (N) - - EST (output) DOUBLE PRECISION - An estimate (a lower bound) for norm(A). - - KASE (input/output) INT - On the initial call to DLACON, KASE should be 0. - On an intermediate return, KASE will be 1 or 2, indicating - whether X should be overwritten by A * X or A' * X. - On the final return from DLACON, KASE will again be 0. - - Further Details - ======= ======= - - Contributed by Nick Higham, University of Manchester. - Originally named CONEST, dated March 16, 1988. - - Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of - a real or complex matrix, with applications to condition estimation", - ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. - ===================================================================== -*/ - - /* Table of constant values */ - int c__1 = 1; - double zero = 0.0; - double one = 1.0; - - /* Local variables */ - static int iter; - static int jump, jlast; - static double altsgn, estold; - static int i, j; - double temp; -#ifdef _CRAY - extern int ISAMAX(int *, double *, int *); - extern double SASUM(int *, double *, int *); - extern int SCOPY(int *, double *, int *, double *, int *); -#else - extern int hypre_F90_NAME_BLAS(idamax,IDAMAX)(int *, double *, int *); - extern double hypre_F90_NAME_BLAS(dasum,DASUM)(int *, double *, int *); - extern int hypre_F90_NAME_BLAS(dcopy,DCOPY)(int *, double *, int *, double *, int *); -#endif -#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */ -#define i_dnnt(a) \ - ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */ - - if ( *kase == 0 ) { - for (i = 0; i < *n; ++i) { - x[i] = 1. / (double) (*n); - } - *kase = 1; - jump = 1; - return 0; - } - - switch (jump) { - case 1: goto L20; - case 2: goto L40; - case 3: goto L70; - case 4: goto L110; - case 5: goto L140; - } - - /* ................ ENTRY (JUMP = 1) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ - L20: - if (*n == 1) { - v[0] = x[0]; - *est = fabs(v[0]); - /* ... QUIT */ - goto L150; - } -#ifdef _CRAY - *est = SASUM(n, x, &c__1); -#else - *est = hypre_F90_NAME_BLAS(dasum,DASUM)(n, x, &c__1); -#endif - - for (i = 0; i < *n; ++i) { - x[i] = d_sign(one, x[i]); - isgn[i] = i_dnnt(x[i]); - } - *kase = 2; - jump = 2; - return 0; - - /* ................ ENTRY (JUMP = 2) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ -L40: -#ifdef _CRAY - j = ISAMAX(n, &x[0], &c__1); -#else - j = hypre_F90_NAME_BLAS(idamax,IDAMAX)(n, &x[0], &c__1); -#endif - --j; - iter = 2; - - /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ -L50: - for (i = 0; i < *n; ++i) x[i] = zero; - x[j] = one; - *kase = 1; - jump = 3; - return 0; - - /* ................ ENTRY (JUMP = 3) - X HAS BEEN OVERWRITTEN BY A*X. */ -L70: -#ifdef _CRAY - SCOPY(n, x, &c__1, v, &c__1); -#else - hypre_F90_NAME_BLAS(dcopy,DCOPY)(n, x, &c__1, v, &c__1); -#endif - estold = *est; -#ifdef _CRAY - *est = SASUM(n, v, &c__1); -#else - *est = hypre_F90_NAME_BLAS(dasum,DASUM)(n, v, &c__1); -#endif - - for (i = 0; i < *n; ++i) - if (i_dnnt(d_sign(one, x[i])) != isgn[i]) - goto L90; - - /* REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */ - goto L120; - -L90: - /* TEST FOR CYCLING. */ - if (*est <= estold) goto L120; - - for (i = 0; i < *n; ++i) { - x[i] = d_sign(one, x[i]); - isgn[i] = i_dnnt(x[i]); - } - *kase = 2; - jump = 4; - return 0; - - /* ................ ENTRY (JUMP = 4) - X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ -L110: - jlast = j; -#ifdef _CRAY - j = ISAMAX(n, &x[0], &c__1); -#else - j = hypre_F90_NAME_BLAS(idamax,IDAMAX)(n, &x[0], &c__1); -#endif - --j; - if (x[jlast] != fabs(x[j]) && iter < 5) { - ++iter; - goto L50; - } - - /* ITERATION COMPLETE. FINAL STAGE. */ -L120: - altsgn = 1.; - for (i = 1; i <= *n; ++i) { - x[i-1] = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); - altsgn = -altsgn; - } - *kase = 1; - jump = 5; - return 0; - - /* ................ ENTRY (JUMP = 5) - X HAS BEEN OVERWRITTEN BY A*X. */ -L140: -#ifdef _CRAY - temp = SASUM(n, x, &c__1) / (double)(*n * 3) * 2.; -#else - temp = hypre_F90_NAME_BLAS(dasum,DASUM)(n,x,&c__1)/(double)(*n * 3) * 2.; -#endif - if (temp > *est) { -#ifdef _CRAY - SCOPY(n, &x[0], &c__1, &v[0], &c__1); -#else - hypre_F90_NAME_BLAS(dcopy,DCOPY)(n, &x[0], &c__1, &v[0], &c__1); -#endif - *est = temp; - } - -L150: - *kase = 0; - return 0; - -} /* dlacon_ */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlamch.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlamch.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,965 +0,0 @@ -#include -#include "slu_Cnames.h" - -#define TRUE_ (1) -#define FALSE_ (0) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) - -double dlamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - Purpose - ======= - - DLAMCH determines double precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by DLAMCH: - = 'E' or 'e', DLAMCH := eps - = 'S' or 's , DLAMCH := sfmin - = 'B' or 'b', DLAMCH := base - = 'P' or 'p', DLAMCH := eps*base - = 'N' or 'n', DLAMCH := t - = 'R' or 'r', DLAMCH := rnd - = 'M' or 'm', DLAMCH := emin - = 'U' or 'u', DLAMCH := rmin - = 'L' or 'l', DLAMCH := emax - = 'O' or 'o', DLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ - - static int first = TRUE_; - - /* System generated locals */ - int i__1; - double ret_val; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static double base; - static int beta; - static double emin, prec, emax; - static int imin, imax; - static int lrnd; - static double rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static double small, sfmin; - extern /* Subroutine */ int dlamc2_(int *, int *, int *, - double *, int *, double *, int *, double *); - static int it; - static double rnd, eps; - - if (first) { - first = FALSE_; - dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (double) beta; - t = (double) it; - if (lrnd) { - rnd = 1.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1) / 2; - } else { - rnd = 0.; - i__1 = 1 - it; - eps = pow_di(&base, &i__1); - } - prec = eps * base; - emin = (double) imin; - emax = (double) imax; - sfmin = rmin; - small = 1. / rmax; - if (small >= sfmin) { - - /* Use SMALL plus a bit, to avoid the possibility of rounding - causing overflow when computing 1/sfmin. */ - sfmin = small * (eps + 1.); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of DLAMCH */ - -} /* dlamch_ */ - - -/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - double d__1, d__2; - /* Local variables */ - static int lrnd; - static double a, b, c, f; - static int lbeta; - static double savec; - extern double dlamc3_(double *, double *); - static int lieee1; - static double t1, t2; - static int lt; - static double one, qtr; - - if (first) { - first = FALSE_; - one = 1.; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.; - c = dlamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = dlamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - d__1 = -a; - c = dlamc3_(&c, &d__1); - lbeta = (int) (c + qtr); - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (double) lbeta; - d__1 = b / 2; - d__2 = -b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - d__1 = b / 2; - d__2 = b / 100; - f = dlamc3_(&d__1, &d__2); - c = dlamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - d__1 = b / 2; - t1 = dlamc3_(&d__1, &a); - d__1 = b / 2; - t2 = dlamc3_(&d__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.; - c = 1.; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = dlamc3_(&a, &one); - d__1 = -a; - c = dlamc3_(&c, &d__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of DLAMC1 */ - -} /* dlamc1_ */ - - -/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, - double *eps, int *emin, double *rmin, int *emax, - double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) DOUBLE PRECISION - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) DOUBLE PRECISION - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) DOUBLE PRECISION - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - double d__1, d__2, d__3, d__4, d__5; - /* Builtin functions */ - double pow_di(double *, int *); - /* Local variables */ - static int ieee; - static double half; - static int lrnd; - static double leps, zero, a, b, c; - static int i, lbeta; - static double rbase; - static int lemin, lemax, gnmin; - static double small; - static int gpmin; - static double third, lrmin, lrmax, sixth; - extern /* Subroutine */ int dlamc1_(int *, int *, int *, - int *); - extern double dlamc3_(double *, double *); - static int lieee1; - extern /* Subroutine */ int dlamc4_(int *, double *, int *), - dlamc5_(int *, int *, int *, int *, int *, - double *); - static int lt, ngnmin, ngpmin; - static double one, two; - - if (first) { - first = FALSE_; - zero = 0.; - one = 1.; - two = 2.; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function DLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - dlamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (double) lbeta; - i__1 = -lt; - a = pow_di(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - d__1 = -half; - sixth = dlamc3_(&b, &d__1); - third = dlamc3_(&sixth, &sixth); - d__1 = -half; - b = dlamc3_(&third, &d__1); - b = dlamc3_(&b, &sixth); - b = abs(b); - if (b < leps) { - b = leps; - } - - leps = 1.; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - d__1 = half * leps; -/* Computing 5th power */ - d__3 = two, d__4 = d__3, d__3 *= d__3; -/* Computing 2nd power */ - d__5 = leps; - d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); - c = dlamc3_(&d__1, &d__2); - d__1 = -c; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - d__1 = -b; - c = dlamc3_(&half, &d__1); - b = dlamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - d__1 = small * rbase; - small = dlamc3_(&d__1, &zero); -/* L20: */ - } - a = dlamc3_(&one, &small); - dlamc4_(&ngpmin, &one, &lbeta); - d__1 = -one; - dlamc4_(&ngnmin, &d__1, &lbeta); - dlamc4_(&gpmin, &a, &lbeta); - d__1 = -a; - dlamc4_(&gnmin, &d__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine DLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine DLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - d__1 = lrmin * rbase; - lrmin = dlamc3_(&d__1, &zero); -/* L30: */ - } - -/* Finally, call DLAMC5 to compute EMAX and RMAX. */ - - dlamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of DLAMC2 */ - -} /* dlamc2_ */ - - -double dlamc3_(double *a, double *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) DOUBLE PRECISION - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - double ret_val; - - ret_val = *a + *b; - - return ret_val; - -/* End of DLAMC3 */ - -} /* dlamc3_ */ - - -/* Subroutine */ int dlamc4_(int *emin, double *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC4 is a service routine for DLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) DOUBLE PRECISION - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static double zero, a; - static int i; - static double rbase, b1, b2, c1, c2, d1, d2; - extern double dlamc3_(double *, double *); - static double one; - - a = *start; - one = 1.; - rbase = one / *base; - zero = 0.; - *emin = 1; - d__1 = a * rbase; - b1 = dlamc3_(&d__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - d__1 = a / *base; - b1 = dlamc3_(&d__1, &zero); - d__1 = b1 * *base; - c1 = dlamc3_(&d__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - d__1 = a * rbase; - b2 = dlamc3_(&d__1, &zero); - d__1 = b2 / rbase; - c2 = dlamc3_(&d__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of DLAMC4 */ - -} /* dlamc4_ */ - - -/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, double *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - DLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A int flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) DOUBLE PRECISION - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static double c_b5 = 0.; - - /* System generated locals */ - int i__1; - double d__1; - /* Local variables */ - static int lexp; - static double oldy; - static int uexp, i; - static double y, z; - static int nbits; - extern double dlamc3_(double *, double *); - static double recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1. / *beta; - z = *beta - 1.; - y = 0.; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.) { - oldy = y; - } - y = dlamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - d__1 = y * *beta; - y = dlamc3_(&d__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of DLAMC5 */ - -} /* dlamc5_ */ - -double pow_di(double *ap, int *bp) -{ - double pow, x; - int n; - - pow = 1; - x = *ap; - n = *bp; - - if(n != 0){ - if(n < 0) { - n = -n; - x = 1/x; - } - for( ; ; ) { - if(n & 01) pow *= x; - if(n >>= 1) x *= x; - else break; - } - } - return(pow); -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlangs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,115 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dlangs.c - * History: Modified from lapack routine DLANGE - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" - -double dlangs(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - DLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - DLANGE returns the value - - DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in DLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int i, j, irow; - double value=0.0, sum; - double *rwork; - - Astore = (NCformat*) A->Store; - Aval = (double*)Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (superlu_lsame(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, fabs( Aval[i]) ); - - } else if (superlu_lsame(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += fabs(Aval[i]); - value = SUPERLU_MAX(value,sum); - } - - } else if (superlu_lsame(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += fabs(Aval[i]); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (superlu_lsame(norm, "F") || superlu_lsame(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* dlangs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlaqgs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,144 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dlaqgs.c - * History: Modified from LAPACK routine DLAQGE - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif - -void -dlaqgs(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - DLAQGS equilibrates a general sparse M by N matrix A using the row and - scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = NC; Dtype = SLU_D; Mtype = GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - double *Aval; - int i, j, irow; - double large, small, cj; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *); - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = (NCformat*) A->Store; - Aval = (double*) Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum") / hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - Aval[i] *= cj; - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= r[irow]; - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= cj * r[irow]; - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* dlaqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmemory.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,690 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_ddefs.h" - -/* Constants */ -#define NO_MEMTYPE 4 /* 0: lusup; - 1: ucol; - 2: lsub; - 3: usub */ -#define GluIntArray(n) (5 * (n) + 5) - -/* Internal prototypes */ -void *dexpand (int *, MemType,int, int, GlobalLU_t *); -int dLUWorkInit (int, int, int, int **, double **, LU_space_t); -void copy_mem_double (int, void *, void *); -void dStackCompress (GlobalLU_t *); -void dSetupSpace (void *, int, LU_space_t *); -void *duser_malloc (int, int); -void duser_free (int, int); - -/* External prototypes (in memory.c - prec-indep) */ -extern void copy_mem_int (int, void *, void *); -extern void user_bcopy (char *, char *, int); - -/* Headers for 4 types of dynamatically managed memory */ -typedef struct e_node { - int size; /* length of the memory that has been used */ - void *mem; /* pointer to the new malloc'd store */ -} ExpHeader; - -typedef struct { - int size; - int used; - int top1; /* grow upward, relative to &array[0] */ - int top2; /* grow downward */ - void *array; -} LU_stack_t; - -/* Variables local to this file */ -static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */ -static LU_stack_t stack; -static int no_expand; - -/* Macros to manipulate stack */ -#define StackFull(x) ( x + stack.used >= stack.size ) -#define NotDoubleAlign(addr) ( (long int)addr & 7 ) -#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) -#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ - (w + 1) * m * sizeof(double) ) -#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ - - - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void dSetupSpace(void *work, int lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -void *duser_malloc(int bytes, int which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void duser_free(int bytes, int which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) -{ - SCformat *Lstore; - NCformat *Ustore; - register int n, iword, dword, panel_size = sp_ienv(1); - - Lstore = (SCformat*) L->Store; - Ustore = (NCformat*) U->Store; - n = L->ncol; - iword = sizeof(int); - dword = sizeof(double); - - /* For LU factors */ - mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] * - dword + Lstore->rowind_colptr[n] * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Ustore->colptr[n] * (dword + iword) ); - - /* Working storage to support factorization */ - mem_usage->total_needed = mem_usage->for_lu + - (float)( (2 * panel_size + 4 + NO_MARKER) * n * iword + - (panel_size + 1) * n * dword ); - - mem_usage->expansions = --no_expand; - - return 0; -} /* dQuerySpace */ - -/* - * Allocate storage for the data structures common to all factor routines. - * For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -int -dLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, - int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, - int **iwork, double **dwork) -{ - int info, iword, dword; - SCformat *Lstore; - NCformat *Ustore; - int *xsup, *supno; - int *lsub, *xlsub; - double *lusup; - int *xlusup; - double *ucol; - int *usub, *xusub; - int nzlmax, nzumax, nzlumax; - int FILL = sp_ienv(6); - - Glu->n = n; - no_expand = 0; - iword = sizeof(int); - dword = sizeof(double); - - if ( !expanders ) - expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader)); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact != SamePattern_SameRowPerm ) { - /* Guess for L\U factors */ - nzumax = nzlumax = FILL * annz; - nzlmax = SUPERLU_MAX(1, FILL/4.) * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else { - dSetupSpace(work, lwork, &Glu->MemModel); - } - -#if ( PRNTlevel >= 1 ) - printf("dLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", - FILL, nzlmax, nzumax); - fflush(stdout); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu->MemModel == SYSTEM ) { - xsup = intMalloc(n+1); - supno = intMalloc(n+1); - xlsub = intMalloc(n+1); - xlusup = intMalloc(n+1); - xusub = intMalloc(n+1); - } else { - xsup = (int *)duser_malloc((n+1) * iword, HEAD); - supno = (int *)duser_malloc((n+1) * iword, HEAD); - xlsub = (int *)duser_malloc((n+1) * iword, HEAD); - xlusup = (int *)duser_malloc((n+1) * iword, HEAD); - xusub = (int *)duser_malloc((n+1) * iword, HEAD); - } - - lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); - - while ( !lusup || !ucol || !lsub || !usub ) { - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE(lusup); - SUPERLU_FREE(ucol); - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - duser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD); - } - nzlumax /= 2; - nzumax /= 2; - nzlmax /= 2; - if ( nzlumax < annz ) { - printf("Not enough memory to perform factorization.\n"); - return (dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - } -#if ( PRNTlevel >= 1) - printf("dLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", - nzlmax, nzumax); - fflush(stdout); -#endif - lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); - } - - } else { - /* fact == SamePattern_SameRowPerm */ - Lstore = (SCformat*) L->Store; - Ustore = (NCformat*) U->Store; - xsup = Lstore->sup_to_col; - supno = Lstore->col_to_sup; - xlsub = Lstore->rowind_colptr; - xlusup = Lstore->nzval_colptr; - xusub = Ustore->colptr; - nzlmax = Glu->nzlmax; /* max from previous factorization */ - nzumax = Glu->nzumax; - nzlumax = Glu->nzlumax; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else if ( lwork == 0 ) { - Glu->MemModel = SYSTEM; - } else { - Glu->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - lsub = (int*) Lstore->rowind; - expanders[LSUB].mem = (int*) Lstore->rowind; - lusup = (double*) Lstore->nzval; - expanders[LUSUP].mem = (double*) Lstore->nzval; - usub = (int*) Ustore->rowind; - expanders[USUB].mem = (int*) Ustore->rowind; - ucol = (double*) Ustore->nzval; - expanders[UCOL].mem = (double*) Ustore->nzval;; - expanders[LSUB].size = nzlmax; - expanders[LUSUP].size = nzlumax; - expanders[USUB].size = nzumax; - expanders[UCOL].size = nzumax; - } - - Glu->xsup = xsup; - Glu->supno = supno; - Glu->lsub = lsub; - Glu->xlsub = xlsub; - Glu->lusup = lusup; - Glu->xlusup = xlusup; - Glu->ucol = ucol; - Glu->usub = usub; - Glu->xusub = xusub; - Glu->nzlmax = nzlmax; - Glu->nzumax = nzumax; - Glu->nzlumax = nzlumax; - - info = dLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel); - if ( info ) - return ( info + dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - - ++no_expand; - return 0; - -} /* dLUMemInit */ - -/* Allocate known working storage. Returns 0 if success, otherwise - returns the number of bytes allocated so far when failure occurred. */ -int -dLUWorkInit(int m, int n, int panel_size, int **iworkptr, - double **dworkptr, LU_space_t MemModel) -{ - int isize, dsize, extra; - double *old_ptr; - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - - isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); - dsize = (m * panel_size + - NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(double); - - if ( MemModel == SYSTEM ) - *iworkptr = (int *) intCalloc(isize/sizeof(int)); - else - *iworkptr = (int *) duser_malloc(isize, TAIL); - if ( ! *iworkptr ) { - fprintf(stderr, "dLUWorkInit: malloc fails for local iworkptr[]\n"); - return (isize + n); - } - - if ( MemModel == SYSTEM ) - *dworkptr = (double *) SUPERLU_MALLOC(dsize); - else { - *dworkptr = (double *) duser_malloc(dsize, TAIL); - if ( NotDoubleAlign(*dworkptr) ) { - old_ptr = *dworkptr; - *dworkptr = (double*) DoubleAlign(*dworkptr); - *dworkptr = (double*) ((double*)*dworkptr - 1); - extra = (char*)old_ptr - (char*)*dworkptr; -#ifdef DEBUG - printf("dLUWorkInit: not aligned, extra %d\n", extra); -#endif - stack.top2 -= extra; - stack.used += extra; - } - } - if ( ! *dworkptr ) { - fprintf(stderr, "malloc fails for local dworkptr[]."); - return (isize + dsize + n); - } - - return 0; -} - - -/* - * Set up pointers for real working arrays. - */ -void -dSetRWork(int m, int panel_size, double *dworkptr, - double **dense, double **tempv) -{ - double zero = 0.0; - - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - *dense = dworkptr; - *tempv = *dense + panel_size*m; - dfill (*dense, m * panel_size, zero); - dfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); -} - -/* - * Free the working storage used by factor routines. - */ -void dLUWorkFree(int *iwork, double *dwork, GlobalLU_t *Glu) -{ - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE (iwork); - SUPERLU_FREE (dwork); - } else { - stack.used -= (stack.size - stack.top2); - stack.top2 = stack.size; -/* dStackCompress(Glu); */ - } - - SUPERLU_FREE (expanders); - expanders = 0; -} - -/* Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -dLUMemXpand(int jcol, - int next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int *maxlen, /* modified - maximum length of a data structure */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#ifdef DEBUG - printf("dLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - if (mem_type == USUB) - new_mem = dexpand(maxlen, mem_type, next, 1, Glu); - else - new_mem = dexpand(maxlen, mem_type, next, 0, Glu); - - if ( !new_mem ) { - int nzlmax = Glu->nzlmax; - int nzumax = Glu->nzumax; - int nzlumax = Glu->nzlumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (dmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); - } - - switch ( mem_type ) { - case LUSUP: - Glu->lusup = (double *) new_mem; - Glu->nzlumax = *maxlen; - break; - case UCOL: - Glu->ucol = (double *) new_mem; - Glu->nzumax = *maxlen; - break; - case LSUB: - Glu->lsub = (int *) new_mem; - Glu->nzlmax = *maxlen; - break; - case USUB: - Glu->usub = (int *) new_mem; - Glu->nzumax = *maxlen; - break; - } - - return 0; - -} - - - -void -copy_mem_double(int howmany, void *oldV, void *newV) -{ - register int i; - double *dold = (double *)oldV; - double *dnew = (double *)newV; - for (i = 0; i < howmany; i++) dnew[i] = dold[i]; -} - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -void -*dexpand ( - int *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int len_to_copy, /* size of the memory to be copied to new store */ - int keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem, *old_mem; - int new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( type == LSUB || type == USUB ) lword = sizeof(int); - else lword = sizeof(double); - - if ( Glu->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - } - } - if ( type == LSUB || type == USUB ) { - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - } else { - copy_mem_double(len_to_copy, expanders[type].mem, new_mem); - } - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = duser_malloc(new_len * lword, HEAD); - if ( NotDoubleAlign(new_mem) && - (type == LUSUP || type == UCOL) ) { - old_mem = new_mem; - new_mem = (void *)DoubleAlign(new_mem); - extra = (char*)new_mem - (char*)old_mem; -#ifdef DEBUG - printf("expand(): not aligned, extra %d\n", extra); -#endif - stack.top1 += extra; - stack.used += extra; - } - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy((char*)expanders[type+1].mem,(char*) new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu->usub = - (int*)((char*)expanders[USUB].mem + extra); - expanders[USUB].mem = - (int*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu->lsub = - (int*)((char*)expanders[LSUB].mem + extra); - expanders[LSUB].mem = - (int*)((char*)expanders[LSUB].mem + extra); - } - if ( type < UCOL ) { - Glu->ucol = - (double*)((char*)expanders[UCOL].mem + extra); - expanders[UCOL].mem = - (double*)((char*)expanders[UCOL].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - if ( type == UCOL ) { - stack.top1 += extra; /* Add same amount for USUB */ - stack.used += extra; - } - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* dexpand */ - - -/* - * Compress the work[] array to remove fragmentation. - */ -void -dStackCompress(GlobalLU_t *Glu) -{ - register int iword, dword, ndim; - char *last, *fragment; - int *ifrom, *ito; - double *dfrom, *dto; - int *xlsub, *lsub, *xusub, *usub, *xlusup; - double *ucol, *lusup; - - iword = sizeof(int); - dword = sizeof(double); - ndim = Glu->n; - - xlsub = Glu->xlsub; - lsub = Glu->lsub; - xusub = Glu->xusub; - usub = Glu->usub; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - lusup = Glu->lusup; - - dfrom = ucol; - dto = (double *)((char*)lusup + xlusup[ndim] * dword); - copy_mem_double(xusub[ndim], dfrom, dto); - ucol = dto; - - ifrom = lsub; - ito = (int *) ((char*)ucol + xusub[ndim] * iword); - copy_mem_int(xlsub[ndim], ifrom, ito); - lsub = ito; - - ifrom = usub; - ito = (int *) ((char*)lsub + xlsub[ndim] * iword); - copy_mem_int(xusub[ndim], ifrom, ito); - usub = ito; - - last = (char*)usub + xusub[ndim] * iword; - fragment = (char*) (((char*)stack.array + stack.top1) - last); - stack.used -= (long int) fragment; - stack.top1 -= (long int) fragment; - - Glu->ucol = ucol; - Glu->lsub = lsub; - Glu->usub = usub; - -#ifdef DEBUG - printf("dStackCompress: fragment %d\n", fragment); - /* for (last = 0; last < ndim; ++last) - print_lu_col("After compress:", last, 0);*/ -#endif - -} - -/* - * Allocate storage for original matrix A - */ -void -dallocateA(int n, int nnz, double **a, int **asub, int **xa) -{ - *a = (double *) doubleMalloc(nnz); - *asub = (int *) intMalloc(nnz); - *xa = (int *) intMalloc(n+1); -} - - -double *doubleMalloc(int n) -{ - double *buf; - buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in doubleMalloc()\n"); - } - return (buf); -} - -double *doubleCalloc(int n) -{ - double *buf; - register int i; - double zero = 0.0; - buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in doubleCalloc()\n"); - } - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - - -int dmemory_usage(const int nzlmax, const int nzumax, - const int nzlumax, const int n) -{ - register int iword, dword; - - iword = sizeof(int); - dword = sizeof(double); - - return (10 * n * iword + - nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmyblas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,224 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void dlsolve ( int ldm, int ncol, double *M, double *rhs ) -{ - int k; - double x0, x1, x2, x3, x4, x5, x6, x7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - - M0 = &M[0]; - - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - Mki4 = Mki3 + ldm + 1; - Mki5 = Mki4 + ldm + 1; - Mki6 = Mki5 + ldm + 1; - Mki7 = Mki6 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++; - x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++; - x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; - x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - rhs[++firstcol] = x4; - rhs[++firstcol] = x5; - rhs[++firstcol] = x6; - rhs[++firstcol] = x7; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++ - - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++ - x7 * *Mki7++; - - M0 += 8 * ldm + 8; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++; - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; - - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -dusolve ( ldm, ncol, M, rhs ) -int ldm; /* in */ -int ncol; /* in */ -double *M; /* in */ -double *rhs; /* modified */ -{ - double xj; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) - rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void dmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) - -int ldm; /* in -- leading dimension of M */ -int nrow; /* in */ -int ncol; /* in */ -double *M; /* in */ -double *vec; /* in */ -double *Mxvec; /* in/out */ - -{ - double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - int k; - - M0 = &M[0]; - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - Mki4 = Mki3 + ldm; - Mki5 = Mki4 + ldm; - Mki6 = Mki5 + ldm; - Mki7 = Mki6 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - vi4 = vec[firstcol++]; - vi5 = vec[firstcol++]; - vi6 = vec[firstcol++]; - vi7 = vec[firstcol++]; - - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ - + vi4 * *Mki4++ + vi5 * *Mki5++ - + vi6 * *Mki6++ + vi7 * *Mki7++; - - M0 += 8 * ldm; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ ; - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++; - - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,462 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include -#include "slu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif - -/* - * Function prototypes - */ -void sludlsolve(int, int, double *, double *); -void sludmatvec(int, int, int, double *, double *, double *); -extern void dcheck_tempv(); - -void -dpanel_bmod ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - const int nseg, /* in */ - double *dense, /* out, of size n by w */ - double *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in, of size n by w */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * - * Performs numeric block updates (sup-panel) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - * Before entering this routine, the original nonzeros in the panel - * were already copied into the spa[m,w]. - * - * Updated/Output parameters- - * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned - * collectively in the m-by-w vector dense[*]. - * - */ - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - double alpha, beta; -#endif - - register int k, ksub; - int fsupc, nsupc, nsupr, nrow; - int krep, krep_ind; - double ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int segsze; - int block_nrow; /* no of rows in a block row */ - register int lptr; /* Points to the row subscripts of a supernode */ - int kfnz, irow, no_zeros; - register int isub, isub1, i; - register int jj; /* Index through each column in the panel */ - int *xsup, *supno; - int *lsub, *xlsub; - double *lusup; - int *xlusup; - int *repfnz_col; /* repfnz[] for a column in the panel */ - double *dense_col; /* dense[] for a column in the panel */ - double *tempv1; /* Used in 1-D update */ - double *TriTmp, *MatvecTmp; /* used in 2-D update */ - double zero = 0.0; -#ifdef USE_VENDOR_BLAS - double one = 1.0; -#endif - register int ldaTmp; - register int r_ind, r_hi; - static int first = 1, maxsuper, rowblk, colblk; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - if ( first ) { - maxsuper = sp_ienv(3); - rowblk = sp_ienv(4); - colblk = sp_ienv(5); - first = 0; - } - ldaTmp = maxsuper + rowblk; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in a supernode - * nsupr = no of rows in a supernode - */ - krep = segrep[k--]; - fsupc = xsup[supno[krep]]; - nsupc = krep - fsupc + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nrow = nsupr - nsupc; - lptr = xlsub[fsupc]; - krep_ind = lptr + nsupc - 1; - - repfnz_col = repfnz; - dense_col = dense; - - if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ - - TriTmp = tempv; - - /* Sequence through each column in panel -- triangular solves */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - dense_col[irow] -= ukj * lusup[luptr]; - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - ukj -= ukj1 * lusup[luptr1]; - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; - dense_col[irow] -= (ukj*lusup[luptr] - + ukj1*lusup[luptr1]); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; luptr2++; - dense_col[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - } else { /* segsze >= 4 */ - - /* Copy U[*,j] segment from dense[*] to TriTmp[*], which - holds the result of triangular solves. */ - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - TriTmp[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze, - &lusup[luptr], &nsupr, TriTmp, &incx ); -#endif -#else - sludlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); -#endif - - - } /* else ... */ - - } /* for jj ... end tri-solves */ - - /* Block row updates; push all the way into dense[*] block */ - for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { - - r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); - block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); - luptr = xlusup[fsupc] + nsupc + r_ind; - isub1 = lptr + nsupc + r_ind; - - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - /* Sequence through each column in panel -- matrix-vector */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - /* Perform a block update, and scatter the result of - matrix-vector to dense[]. */ - no_zeros = kfnz - fsupc; - luptr1 = luptr + nsupr * no_zeros; - MatvecTmp = &TriTmp[maxsuper]; - -#ifdef USE_VENDOR_BLAS - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#else - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &block_nrow, &segsze, - &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#endif -#else - sludmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], - TriTmp, MatvecTmp); -#endif - - /* Scatter MatvecTmp[*] into SPA dense[*] temporarily - * such that MatvecTmp[*] can be re-used for the - * the next blok row update. dense[] will be copied into - * global store after the whole panel has been finished. - */ - isub = isub1; - for (i = 0; i < block_nrow; i++) { - irow = lsub[isub]; - dense_col[irow] -= MatvecTmp[i]; - MatvecTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } /* for each block row ... */ - - /* Scatter the triangular solves into SPA dense[*] */ - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = TriTmp[i]; - TriTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } else { /* 1-D block modification */ - - - /* Sequence through each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - dense_col[irow] -= ukj * lusup[luptr]; - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - ukj -= ukj1 * lusup[luptr1]; - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; - dense_col[irow] -= (ukj*lusup[luptr] - + ukj1*lusup[luptr1]); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; ++luptr2; - dense_col[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - } else { /* segsze >= 4 */ - /* - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense[]. - */ - no_zeros = kfnz - fsupc; - - /* Copy U[*,j] segment from dense[*] to tempv[*]: - * The result of triangular solve is in tempv[*]; - * The result of matrix vector update is in dense_col[*] - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - tempv[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze, - &lusup[luptr], &nsupr, tempv, &incx ); -#endif - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&segsze,&alpha, - &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - sludlsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - sludmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); -#endif - - /* Scatter tempv[*] into SPA dense[*] temporarily, such - * that tempv[*] can be used for the triangular solve of - * the next column of the panel. They will be copied into - * ucol[*] after the whole panel has been finished. - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = tempv[i]; - tempv[i] = zero; - isub++; - } - - /* Scatter the update from tempv1[*] into SPA dense[*] */ - /* Start dense rectangular L */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - dense_col[irow] -= tempv1[i]; - tempv1[i] = zero; - ++isub; - } - - } /* else segsze>=4 ... */ - - } /* for each column in the panel... */ - - } /* else 1-D update ... */ - - } /* for each updating supernode ... */ - -} - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,247 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -void -dpanel_dfs ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - SuperMatrix *A, /* in - original matrix */ - int *perm_r, /* in */ - int *nseg, /* out */ - double *dense, /* out */ - int *panel_lsub, /* out */ - int *segrep, /* out */ - int *repfnz, /* out */ - int *xprune, /* out */ - int *marker, /* out */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * - * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. - * - * The routine returns one list of the supernodal representatives - * in topological order of the dfs that generates them. This list is - * a superset of the topological order of each individual column within - * the panel. - * The location of the first nonzero in each supernodal segment - * (supernodal entry location) is also returned. Each column has a - * separate list for this purpose. - * - * Two marker arrays are used for dfs: - * marker[i] == jj, if i was visited during dfs of current column jj; - * marker1[i] >= jcol, if i was visited by earlier columns in this panel; - * - * marker: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - */ - NCPformat *Astore; - double *a; - int *asub; - int *xa_begin, *xa_end; - int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; - int k, krow, kmark, kperm; - int xdfs, maxdfs, kpar; - int jj; /* index through each column in the panel */ - int *marker1; /* marker1[jj] >= jcol if vertex jj was visited - by a previous column within this panel. */ - int *repfnz_col; /* start of each column in the panel */ - double *dense_col; /* start of each column in the panel */ - int nextl_col; /* next available position in panel_lsub[*,jj] */ - int *xsup, *supno; - int *lsub, *xlsub; - - /* Initialize pointers */ - Astore = (NCPformat*) A->Store; - a = ( double*) Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - marker1 = marker + m; - repfnz_col = repfnz; - dense_col = dense; - *nseg = 0; - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - - /* For each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++) { - nextl_col = (jj - jcol) * m; - -#ifdef CHK_DFS - printf("\npanel col %d: ", jj); -#endif - - /* For each nonz in A[*,jj] do dfs */ - for (k = xa_begin[jj]; k < xa_end[jj]; k++) { - krow = asub[k]; - dense_col[krow] = a[k]; - kmark = marker[krow]; - if ( kmark == jj ) - continue; /* krow visited before, go to the next nonzero */ - - /* For each unmarked nbr krow of jj - * krow is in L: place it in structure of L[*,jj] - */ - marker[krow] = jj; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ - } - /* - * krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - else { - - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz_col[krep]; - -#ifdef CHK_DFS - printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); -#endif - if ( myfnz != EMPTY ) { /* Representative visited before */ - if ( myfnz > kperm ) repfnz_col[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz_col[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker[kchild]; - - if ( chmark != jj ) { /* Not reached yet */ - marker[kchild] = jj; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,j] */ - if ( chperm == EMPTY ) { - panel_lsub[nextl_col++] = kchild; - } - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - else { - - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz_col[chrep]; -#ifdef CHK_DFS - printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); -#endif - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz_col[chrep] = chperm; - } - else { - /* Cont. dfs at snode-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L) */ - parent[krep] = oldrep; - repfnz_col[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } /* else */ - - } /* else */ - - } /* if... */ - - } /* while xdfs < maxdfs */ - - /* krow has no more unexplored nbrs: - * Place snode-rep krep in postorder DFS, if this - * segment is seen for the first time. (Note that - * "repfnz[krep]" may change later.) - * Backtrack dfs to its parent. - */ - if ( marker1[krep] < jcol ) { - segrep[*nseg] = krep; - ++(*nseg); - marker1[krep] = jj; - } - - kpar = parent[krep]; /* Pop stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } while ( kpar != EMPTY ); /* do-while - until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonz in A[*,jj] */ - - repfnz_col += m; /* Move to next column */ - dense_col += m; - - } /* for jj ... */ - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,115 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" -#ifdef HYPRE_USING_HYPRE_LAPACK -#include "hypre_lapack.h" -#endif - -double -dPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, - SuperMatrix *L, SuperMatrix *U) -{ -/* - * Purpose - * ======= - * - * Compute the reciprocal pivot growth factor of the leading ncols columns - * of the matrix, using the formula: - * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) - * - * Arguments - * ========= - * - * ncols (input) int - * The number of columns of matrices A, L and U. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = NC; Dtype = SLU_D; Mtype = GE. - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SC; Dtype = SLU_D; Mtype = TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = NC; - * Dtype = SLU_D; Mtype = TRU. - * - */ - NCformat *Astore; - SCformat *Lstore; - NCformat *Ustore; - double *Aval, *Lval, *Uval; - int fsupc, nsupr, luptr, nz_in_U; - int i, j, k, oldcol; - int *inv_perm_c; - double rpg, maxaj, maxuj; - extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *); - double smlnum; - double *luval; - - /* Get machine constants. */ - smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S"); - rpg = 1. / smlnum; - - Astore = (NCformat*) A->Store; - Lstore = (SCformat*) L->Store; - Ustore = (NCformat*) U->Store; - Aval = (double*) Astore->nzval; - Lval = (double*) Lstore->nzval; - Uval = (double*) Ustore->nzval; - - inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); - for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; - - for (k = 0; k <= Lstore->nsuper; ++k) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - luptr = L_NZ_START(fsupc); - luval = &Lval[luptr]; - nz_in_U = 1; - - for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { - maxaj = 0.; - oldcol = inv_perm_c[j]; - for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) - maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) ); - - maxuj = 0.; - for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) - maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) ); - - /* Supernode */ - for (i = 0; i < nz_in_U; ++i) - maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) ); - - ++nz_in_U; - luval += nsupr; - - if ( maxuj == 0. ) - rpg = SUPERLU_MIN( rpg, 1.); - else - rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); - } - - if ( j >= ncols ) break; - } - - SUPERLU_FREE(inv_perm_c); - return (rpg); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_ddefs.h" - -#undef DEBUG - -int -dpivotL( - const int jcol, /* in */ - const double u, /* in - diagonal pivoting threshold */ - int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ - int *perm_r, /* may be modified */ - int *iperm_r, /* in - inverse of perm_r */ - int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ - int *pivrow, /* out */ - GlobalLU_t *Glu, /* modified - global LU data structures */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * Performs the numerical pivoting on the current column of L, - * and the CDIV operation. - * - * Pivot policy: - * (1) Compute thresh = u * max_(i>=j) abs(A_ij); - * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN - * pivot row = k; - * ELSE IF abs(A_jj) >= thresh THEN - * pivot row = j; - * ELSE - * pivot row = m; - * - * Note: If you absolutely want to use a given pivot order, then set u=0.0. - * - * Return value: 0 success; - * i > 0 U(i,i) is exactly zero. - * - */ - int fsupc; /* first column in the supernode */ - int nsupc; /* no of columns in the supernode */ - int nsupr; /* no of rows in the supernode */ - int lptr; /* points to the starting subscript of the supernode */ - int pivptr, old_pivptr, diag, diagind; - double pivmax, rtemp, thresh; - double temp; - double *lu_sup_ptr; - double *lu_col_ptr; - int *lsub_ptr; - int isub, icol, k, itemp; - int *lsub, *xlsub; - double *lusup; - int *xlusup; - flops_t *ops = stat->ops; - - /* Initialize pointers */ - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; - nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ - lptr = xlsub[fsupc]; - nsupr = xlsub[fsupc+1] - lptr; - lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ - lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ - lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ - -#ifdef DEBUG -if ( jcol == MIN_COL ) { - printf("Before cdiv: col %d\n", jcol); - for (k = nsupc; k < nsupr; k++) - printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); -} -#endif - - /* Determine the largest abs numerical value for partial pivoting; - Also search for user-specified pivot, and diagonal element. */ - if ( *usepr ) *pivrow = iperm_r[jcol]; - diagind = iperm_c[jcol]; - pivmax = 0.0; - pivptr = nsupc; - diag = EMPTY; - old_pivptr = nsupc; - for (isub = nsupc; isub < nsupr; ++isub) { - rtemp = fabs (lu_col_ptr[isub]); - if ( rtemp > pivmax ) { - pivmax = rtemp; - pivptr = isub; - } - if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; - if ( lsub_ptr[isub] == diagind ) diag = isub; - } - - /* Test for singularity */ - if ( pivmax == 0.0 ) { - *pivrow = lsub_ptr[pivptr]; - perm_r[*pivrow] = jcol; - *usepr = 0; - return (jcol+1); - } - - thresh = u * pivmax; - - /* Choose appropriate pivotal element by our policy. */ - if ( *usepr ) { - rtemp = fabs (lu_col_ptr[old_pivptr]); - if ( rtemp != 0.0 && rtemp >= thresh ) - pivptr = old_pivptr; - else - *usepr = 0; - } - if ( *usepr == 0 ) { - /* Use diagonal pivot? */ - if ( diag >= 0 ) { /* diagonal exists */ - rtemp = fabs (lu_col_ptr[diag]); - if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; - } - *pivrow = lsub_ptr[pivptr]; - } - - /* Record pivot row */ - perm_r[*pivrow] = jcol; - - /* Interchange row subscripts */ - if ( pivptr != nsupc ) { - itemp = lsub_ptr[pivptr]; - lsub_ptr[pivptr] = lsub_ptr[nsupc]; - lsub_ptr[nsupc] = itemp; - - /* Interchange numerical values as well, for the whole snode, such - * that L is indexed the same way as A. - */ - for (icol = 0; icol <= nsupc; icol++) { - itemp = pivptr + icol * nsupr; - temp = lu_sup_ptr[itemp]; - lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; - lu_sup_ptr[nsupc + icol*nsupr] = temp; - } - } /* if */ - - /* cdiv operation */ - ops[FACT] += nsupr - nsupc; - - temp = 1.0 / lu_col_ptr[nsupc]; - for (k = nsupc+1; k < nsupr; k++) - lu_col_ptr[k] *= temp; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpruneL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpruneL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpruneL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,147 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -void -dpruneL( - const int jcol, /* in */ - const int *perm_r, /* in */ - const int pivrow, /* in */ - const int nseg, /* in */ - const int *segrep, /* in */ - const int *repfnz, /* in */ - int *xprune, /* out */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ -/* - * Purpose - * ======= - * Prunes the L-structure of supernodes whose L-structure - * contains the current pivot row "pivrow" - * - */ - double utemp; - int jsupno, irep, irep1, kmin, kmax, krow, movnum; - int i, ktemp, minloc, maxloc; - int do_prune; /* logical variable */ - int *xsup, *supno; - int *lsub, *xlsub; - double *lusup; - int *xlusup; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - /* - * For each supernode-rep irep in U[*,j] - */ - jsupno = supno[jcol]; - for (i = 0; i < nseg; i++) { - - irep = segrep[i]; - irep1 = irep + 1; - do_prune = FALSE; - - /* Don't prune with a zero U-segment */ - if ( repfnz[irep] == EMPTY ) - continue; - - /* If a snode overlaps with the next panel, then the U-segment - * is fragmented into two parts -- irep and irep1. We should let - * pruning occur at the rep-column in irep1's snode. - */ - if ( supno[irep] == supno[irep1] ) /* Don't prune */ - continue; - - /* - * If it has not been pruned & it has a nonz in row L[pivrow,i] - */ - if ( supno[irep] != jsupno ) { - if ( xprune[irep] >= xlsub[irep1] ) { - kmin = xlsub[irep]; - kmax = xlsub[irep1] - 1; - for (krow = kmin; krow <= kmax; krow++) - if ( lsub[krow] == pivrow ) { - do_prune = TRUE; - break; - } - } - - if ( do_prune ) { - - /* Do a quicksort-type partition - * movnum=TRUE means that the num values have to be exchanged. - */ - movnum = FALSE; - if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ - movnum = TRUE; - - while ( kmin <= kmax ) { - - if ( perm_r[lsub[kmax]] == EMPTY ) - kmax--; - else if ( perm_r[lsub[kmin]] != EMPTY ) - kmin++; - else { /* kmin below pivrow, and kmax above pivrow: - * interchange the two subscripts - */ - ktemp = lsub[kmin]; - lsub[kmin] = lsub[kmax]; - lsub[kmax] = ktemp; - - /* If the supernode has only one column, then we - * only keep one set of subscripts. For any subscript - * interchange performed, similar interchange must be - * done on the numerical values. - */ - if ( movnum ) { - minloc = xlusup[irep] + (kmin - xlsub[irep]); - maxloc = xlusup[irep] + (kmax - xlsub[irep]); - utemp = lusup[minloc]; - lusup[minloc] = lusup[maxloc]; - lusup[maxloc] = utemp; - } - - kmin++; - kmax--; - - } - - } /* while */ - - xprune[irep] = kmin; /* Pruning */ - -#ifdef CHK_PRUNE - printf(" After dpruneL(),using col %d: xprune[%d] = %d\n", - jcol, irep, kmin); -#endif - } /* if do_prune */ - - } /* if */ - - } /* for each U-segment... */ -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dreadhb.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dreadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dreadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,258 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include -#include "slu_ddefs.h" - - -/* Eat up the rest of the current line */ -int dDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -int dParseIntFormat(char *buf, int *num, int *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - sscanf(tmp, "%d", num); - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - sscanf(tmp, "%d", size); - return 0; -} - -int dParseFloatFormat(char *buf, int *num, int *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ - - return 0; -} - -int dReadVector(FILE *fp, int n, int *where, int perline, int persize) -{ - register int i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jops; - - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - nextlu = xlusup[jcol]; - - /* - * Process the supernodal portion of L\U[*,j] - */ - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = 0; - ++nextlu; - } - - xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ - - if ( fsupc < jcol ) { - - luptr = xlusup[fsupc]; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nsupc = jcol - fsupc; /* Excluding jcol */ - ufirst = xlusup[jcol]; /* Points to the beginning of column - jcol in supernode L\U(jsupno). */ - nrow = nsupr - nsupc; - - ops[TRSV] += nsupc * (nsupc - 1); - ops[GEMV] += 2 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&nsupc,&lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&nsupc,&alpha, - &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - sludlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - sludmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], &tempv[0] ); - - /* Scatter tempv[*] into lusup[*] */ - iptr = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - lusup[iptr++] -= tempv[i]; - tempv[i] = 0.0; - } -#endif - - } - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -int -dsnode_dfs ( - const int jcol, /* in - start of the supernode */ - const int kcol, /* in - end of the supernode */ - const int *asub, /* in */ - const int *xa_begin, /* in */ - const int *xa_end, /* in */ - int *xprune, /* out */ - int *marker, /* modified */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* Purpose - * ======= - * dsnode_dfs() - Determine the union of the row structures of those - * columns within the relaxed snode. - * Note: The relaxed snodes are leaves of the supernodal etree, therefore, - * the portion outside the rectangular supernode must be zero. - * - * Return value - * ============ - * 0 success; - * >0 number of bytes allocated when run out of memory. - * - */ - register int i, k, ifrom, ito, nextl, new_next; - int nsuper, krow, kmark, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - int nzlmax; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - nsuper = ++supno[jcol]; /* Next available supernode number */ - nextl = xlsub[jcol]; - - for (i = jcol; i <= kcol; i++) { - /* For each nonzero in A[*,i] */ - for (k = xa_begin[i]; k < xa_end[i]; k++) { - krow = asub[k]; - kmark = marker[krow]; - if ( kmark != kcol ) { /* First time visit krow */ - marker[krow] = kcol; - lsub[nextl++] = krow; - if ( nextl >= nzlmax ) { - if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) - return (mem_error); - lsub = Glu->lsub; - } - } - } - supno[i] = nsuper; - } - - /* Supernode > 1, then make a copy of the subscripts for pruning */ - if ( jcol < kcol ) { - new_next = nextl + (nextl - xlsub[jcol]); - while ( new_next > nzlmax ) { - if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) - return (mem_error); - lsub = Glu->lsub; - } - ito = nextl; - for (ifrom = xlsub[jcol]; ifrom < nextl; ) - lsub[ito++] = lsub[ifrom++]; - for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; - nextl = ito; - } - - xsup[nsuper+1] = kcol + 1; - supno[kcol+1] = nsuper; - xprune[kcol] = nextl; - xlsub[kcol+1] = nextl; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,487 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: dsp_blas2.c - * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include "slu_ddefs.h" - -#ifndef HYPRE_USING_HYPRE_BLAS -#define USE_VENDOR_BLAS -#endif -#ifdef __cplusplus -extern "C" { -#endif -extern int hypre_F90_NAME_BLAS(dtrsv,DTRSV)(const char *,const char *,const char *,int *,double *,int *,double *,int *); -#ifdef __cplusplus -} -#endif -/* - * Function prototypes - */ -extern void sludusolve(int, int, double*, double*); -extern void sludlsolve(int, int, double*, double*); -extern void sludmatvec(int, int, int, double*, double*, double*); - -int -sp_dtrsv(const char *uplo,const char *trans,const char *diag, SuperMatrix *L, - SuperMatrix *U, double *x, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * sp_dtrsv() solves one of the systems of equations - * A*x = b, or A'*x = b, - * where b and x are n element vectors and A is a sparse unit , or - * non-unit, upper or lower triangular matrix. - * No test for singularity or near-singularity is included in this - * routine. Such tests must be performed before calling this routine. - * - * Parameters - * ========== - * - * uplo - (input) char* - * On entry, uplo specifies whether the matrix is an upper or - * lower triangular matrix as follows: - * uplo = 'U' or 'u' A is an upper triangular matrix. - * uplo = 'L' or 'l' A is a lower triangular matrix. - * - * trans - (input) char* - * On entry, trans specifies the equations to be solved as - * follows: - * trans = 'N' or 'n' A*x = b. - * trans = 'T' or 't' A'*x = b. - * trans = 'C' or 'c' A'*x = b. - * - * diag - (input) char* - * On entry, diag specifies whether or not A is unit - * triangular as follows: - * diag = 'U' or 'u' A is assumed to be unit triangular. - * diag = 'N' or 'n' A is not assumed to be unit - * triangular. - * - * L - (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SC, Dtype = SLU_D, Mtype = TRLU. - * - * U - (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. - * U has types: Stype = NC, Dtype = SLU_D, Mtype = TRU. - * - * x - (input/output) double* - * Before entry, the incremented array X must contain the n - * element right-hand side vector b. On exit, X is overwritten - * with the solution vector x. - * - * info - (output) int* - * If *info = -i, the i-th argument had an illegal value. - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - SCformat *Lstore; - NCformat *Ustore; - double *Lval, *Uval; - int incx = 1; -#ifdef USE_VENDOR_BLAS - int incy = 1; - double alpha = 1.0, beta = 1.0; -#endif - int nrow; - int fsupc, nsupr, nsupc, luptr, istart, irow; - int i, k, iptr, jcol; - double *work; - flops_t solve_ops; - - /* Test the input parameters */ - *info = 0; - if ( !superlu_lsame(uplo,"L") && !superlu_lsame(uplo, "U") ) *info = -1; - else if ( !superlu_lsame(trans, "N") && !superlu_lsame(trans, "T") && - !superlu_lsame(trans, "C")) *info = -2; - else if ( !superlu_lsame(diag, "U") && !superlu_lsame(diag, "N") ) *info = -3; - else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - superlu_xerbla("sp_dtrsv", &i); - return 0; - } - - Lstore = (SCformat*) L->Store; - Lval = (double*) Lstore->nzval; - Ustore = (NCformat*) U->Store; - Uval = (double*) Ustore->nzval; - solve_ops = 0; - - if ( !(work = doubleCalloc(L->nrow)) ) - ABORT("Malloc fails for work in sp_dtrsv()."); - - if ( superlu_lsame(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( superlu_lsame(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1); - solve_ops += 2 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - x[irow] -= x[fsupc] * Lval[luptr]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U", &nsupc, - &Lval[luptr], &nsupr, &x[fsupc], &incx); - - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow, &nsupc, - &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#endif -#else - sludlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - sludmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - x[irow] -= work[i]; /* Scatter */ - work[i] = 0.0; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - x[irow] -= x[fsupc] * Uval[i]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U","N","N",&nsupc, - &Lval[luptr], &nsupr, &x[fsupc], &incx); -#endif -#else - sludusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - x[irow] -= x[jcol] * Uval[i]; - } - } - } - } /* for k ... */ - - } - } else { /* Form x := inv(A')*x */ - - if ( superlu_lsame(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 2 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - x[jcol] -= x[irow] * Lval[i]; - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","T","U",&nsupc, - &Lval[luptr], &nsupr, &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - x[jcol] -= x[irow] * Uval[i]; - } - } - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U","T","N", &nsupc, - &Lval[luptr], &nsupr, &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - stat->ops[SOLVE] += solve_ops; - SUPERLU_FREE(work); - return 0; -} - - - - -int -sp_dgemv(const char *trans, double alpha, SuperMatrix *A, double *x, - int incx, double beta, double *y, int incy) -{ -/* Purpose - ======= - - sp_dgemv() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) double - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. - In the future, more general A can be handled. - - X - (input) double*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) double - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) double*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - double *Aval; - int info; - double temp; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - - notran = superlu_lsame(trans, "N"); - Astore = (NCformat*) A->Store; - Aval = (double*) Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !superlu_lsame(trans, "T") && !superlu_lsame(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - superlu_xerbla("sp_dgemv ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) - return 0; - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (superlu_lsame(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if (beta != 1.) { - if (incy == 1) { - if (beta == 0.) - for (i = 0; i < leny; ++i) y[i] = 0.; - else - for (i = 0; i < leny; ++i) y[i] = beta * y[i]; - } else { - iy = ky; - if (beta == 0.) - for (i = 0; i < leny; ++i) { - y[iy] = 0.; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - y[iy] = beta * y[iy]; - iy += incy; - } - } - } - - if (alpha == 0.) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if (x[jx] != 0.) { - temp = alpha * x[jx]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - y[irow] += temp * Aval[i]; - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp += Aval[i] * x[irow]; - } - y[jy] += alpha * temp; - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_dgemv */ - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas3.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "slu_ddefs.h" - -int -sp_dgemm(const char *transa,const char *transb, int m, int n, int k, - double alpha, SuperMatrix *A, double *b, int ldb, - double beta, double *c, int ldc) -{ -/* Purpose - ======= - - sp_d performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) double - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. - In the future, more general A can be handled. - - B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) double - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_dgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dutil.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,478 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "slu_ddefs.h" - -void -dCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, - double *nzval, int *rowind, int *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NCformat*) A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -dCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, - double *nzval, int *colind, int *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = (NRformat*) A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* Copy matrix A into matrix B. */ -void -dCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void -dCreate_Dense_Matrix(SuperMatrix *X, int m, int n, double *x, int ldx, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (double *) x; -} - -void -dCopy_Dense_Matrix(int M, int N, double *X, int ldx, - double *Y, int ldy) -{ -/* - * - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -dCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, - double *nzval, int *nzval_colptr, int *rowind, - int *rowind_colptr, int *col_to_sup, int *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = (SCformat*) L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -dCompRow_to_CompCol(int m, int n, int nnz, - double *a, int *colind, int *rowptr, - double **at, int **rowind, int **colptr) -{ - register int i, j, col, relpos; - int *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (double *) doubleMalloc(nnz); - *rowind = (int *) intMalloc(nnz); - *colptr = (int *) intMalloc(n+1); - marker = (int *) intCalloc(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - - -void -dPrint_CompCol_Matrix(char *what, SuperMatrix *A) -{ - NCformat *Astore; - register int i,n; - double *dp; - - printf("\nCompCol matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (NCformat *) A->Store; - dp = (double *) Astore->nzval; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - printf("nzval: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%f ", dp[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); - printf("\n"); - fflush(stdout); -} - -void -dPrint_SuperNode_Matrix(char *what, SuperMatrix *A) -{ - SCformat *Astore; - register int i, j, k, c, d, n, nsup; - double *dp; - int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; - - printf("\nSuperNode matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (SCformat *) A->Store; - dp = (double *) Astore->nzval; - col_to_sup = Astore->col_to_sup; - sup_to_col = Astore->sup_to_col; - rowind_colptr = Astore->rowind_colptr; - rowind = Astore->rowind; - printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", - A->nrow,A->ncol,Astore->nnz,Astore->nsuper); - printf("nzval:\n"); - for (k = 0; k <= Astore->nsuper; ++k) { - c = sup_to_col[k]; - nsup = sup_to_col[k+1] - c; - for (j = c; j < c + nsup; ++j) { - d = Astore->nzval_colptr[j]; - for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { - printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]); - } - } - } -#if 0 - for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); -#endif - printf("\nnzval_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->rowind_colptr[n]; ++i) - printf("%d ", Astore->rowind[i]); - printf("\nrowind_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); - printf("\ncol_to_sup: "); - for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); - printf("\nsup_to_col: "); - for (i = 0; i <= Astore->nsuper+1; ++i) - printf("%d ", sup_to_col[i]); - printf("\n"); - fflush(stdout); -} - -void -dPrint_Dense_Matrix(char *what, SuperMatrix *A) -{ - DNformat *Astore; - register int i, j, lda; - double *dp; - - printf("\nDense matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - lda = Astore->lda; - dp = (double *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); - printf("\nnzval: "); - for (j = 0; j < A->ncol; ++j) { - for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i + j*lda]); - printf("\n"); - } - printf("\n"); - fflush(stdout); -} - -/* - * Diagnostic print of column "jcol" in the U/L factor. - */ -void -dprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) -{ - int i, k, fsupc; - int *xsup, *supno; - int *xlsub, *lsub; - double *lusup; - int *xlusup; - double *ucol; - int *usub, *xusub; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - - printf("%s", msg); - printf("col %d: pivrow %d, supno %d, xprune %d\n", - jcol, pivrow, supno[jcol], xprune[jcol]); - - printf("\tU-col:\n"); - for (i = xusub[jcol]; i < xusub[jcol+1]; i++) - printf("\t%d%10.4f\n", usub[i], ucol[i]); - printf("\tL-col in rectangular snode:\n"); - fsupc = xsup[supno[jcol]]; /* first col of the snode */ - i = xlsub[fsupc]; - k = xlusup[jcol]; - while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { - printf("\t%d\t%10.4f\n", lsub[i], lusup[k]); - i++; k++; - } - fflush(stdout); -} - - -/* - * Check whether tempv[] == 0. This should be true before and after - * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". - */ -void dcheck_tempv(int n, double *tempv) -{ - int i; - - for (i = 0; i < n; i++) { - if (tempv[i] != 0.0) - { - fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]); - ABORT("dcheck_tempv"); - } - } -} - - -void -dGenXtrue(int n, int nrhs, double *x, int ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/ - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -dFillRHS(trans_t trans, int nrhs, double *x, int ldx, - SuperMatrix *A, SuperMatrix *B) -{ - DNformat *Bstore; - double *rhs; - double one = 1.0; - double zero = 0.0; - int ldc; - char transc[1]; - - Bstore = (DNformat*) B->Store; - rhs = (double*) Bstore->nzval; - ldc = Bstore->lda; - - if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; - else *(unsigned char *)transc = 'T'; - - sp_dgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldc); - -} - -/* - * Fills a double precision array with a given value. - */ -void -dfill(double *a, int alen, double dval) -{ - register int i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void dinf_norm_error(int nrhs, SuperMatrix *X, double *xtrue) -{ - DNformat *Xstore; - double err, xnorm; - double *Xmat, *soln_work; - int i, j; - - Xstore = (DNformat*) X->Store; - Xmat = (double*) Xstore->nzval; - - for (j = 0; j < nrhs; j++) { - soln_work = &Xmat[j*Xstore->lda]; - err = xnorm = 0.0; - for (i = 0; i < X->nrow; i++) { - err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i])); - xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i])); - } - err = err / xnorm; - printf("||X - Xtrue||/||X|| = %e\n", err); - } -} - - - -/* Print performance of the code. */ -void -dPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, - double rpg, double rcond, double *ferr, - double *berr, char *equed, SuperLUStat_t *stat) -{ - SCformat *Lstore; - NCformat *Ustore; - double *utime; - flops_t *ops; - - utime = stat->utime; - ops = stat->ops; - - if ( utime[FACT] != 0. ) - printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], - ops[FACT]*1e-6/utime[FACT]); - printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); - if ( utime[SOLVE] != 0. ) - printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], - ops[SOLVE]*1e-6/utime[SOLVE]); - - Lstore = (SCformat *) L->Store; - Ustore = (NCformat *) U->Store; - printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); - printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); - printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); - - printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", - mem_usage->for_lu/1e6, mem_usage->total_needed/1e6, - mem_usage->expansions); - - printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); - printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", - utime[FACT], ops[FACT]*1e-6/utime[FACT], - utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], - utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); - - printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); - printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", - rpg, rcond, ferr[0], berr[0], equed); - -} - - - - -int print_double_vec(char *what, int n, double *vec) -{ - int i; - printf("%s: n %d\n", what, n); - for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]); - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dzsum1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dzsum1.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dzsum1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dzsum1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,84 +0,0 @@ -#include "slu_Cnames.h" -#include "slu_dcomplex.h" - -double dzsum1_(int *n, doublecomplex *cx, int *incx) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - Purpose - ======= - - DZSUM1 takes the sum of the absolute values of a complex - vector and returns a double precision result. - - Based on DZASUM from the Level 1 BLAS. - The change is to use the 'genuine' absolute value. - - Contributed by Nick Higham for use with ZLACON. - - Arguments - ========= - - N (input) INT - The number of elements in the vector CX. - - CX (input) COMPLEX*16 array, dimension (N) - The vector whose elements will be summed. - - INCX (input) INT - The spacing between successive values of CX. INCX > 0. - - ===================================================================== -*/ - - /* Builtin functions */ - double z_abs(doublecomplex *); - - /* Local variables */ - int i, nincx; - double stemp; - - -#define CX(I) cx[(I)-1] - - stemp = 0.; - if (*n <= 0) { - return stemp; - } - if (*incx == 1) { - goto L20; - } - - /* CODE FOR INCREMENT NOT EQUAL TO 1 */ - - nincx = *n * *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - - /* NEXT LINE MODIFIED. */ - - stemp += z_abs(&CX(i)); -/* L10: */ - } - - return stemp; - - /* CODE FOR INCREMENT EQUAL TO 1 */ - -L20: - for (i = 1; i <= *n; ++i) { - - /* NEXT LINE MODIFIED. */ - - stemp += z_abs(&CX(i)); -/* L30: */ - } - - return stemp; - - /* End of DZSUM1 */ - -} /* dzsum1_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/get_perm_c.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/get_perm_c.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/get_perm_c.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/get_perm_c.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,451 +0,0 @@ -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include "slu_ddefs.h" -#include "slu_util.h" -#include "colamd.h" - -extern int genmmd_(int *, int *, int *, int *, int *, int *, int *, - int *, int *, int *, int *, int *); - -void -get_colamd( - const int m, /* number of rows in matrix A. */ - const int n, /* number of columns in matrix A. */ - const int nnz,/* number of nonzeros in matrix A. */ - int *colptr, /* column pointer of size n+1 for matrix A. */ - int *rowind, /* row indices of size nz for matrix A. */ - int *perm_c /* out - the column permutation vector. */ - ) -{ - int Alen, *A, i, info, *p; - double knobs[COLAMD_KNOBS]; - int stats[COLAMD_STATS]; - - Alen = colamd_recommended(nnz, m, n); - - colamd_set_defaults(knobs); - - if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) - ABORT("Malloc fails for A[]"); - if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) - ABORT("Malloc fails for p[]"); - for (i = 0; i <= n; ++i) p[i] = colptr[i]; - for (i = 0; i < nnz; ++i) A[i] = rowind[i]; - info = colamd(m, n, Alen, A, p, knobs, stats); - if ( info == FALSE ) ABORT("COLAMD failed"); - - for (i = 0; i < n; ++i) perm_c[p[i]] = i; - - SUPERLU_FREE(A); - SUPERLU_FREE(p); -} - -void -getata( - const int m, /* number of rows in matrix A. */ - const int n, /* number of columns in matrix A. */ - const int nz, /* number of nonzeros in matrix A */ - int *colptr, /* column pointer of size n+1 for matrix A. */ - int *rowind, /* row indices of size nz for matrix A. */ - int *atanz, /* out - on exit, returns the actual number of - nonzeros in matrix A'*A. */ - int **ata_colptr, /* out - size n+1 */ - int **ata_rowind /* out - size *atanz */ - ) -/* - * Purpose - * ======= - * - * Form the structure of A'*A. A is an m-by-n matrix in column oriented - * format represented by (colptr, rowind). The output A'*A is in column - * oriented format (symmetrically, also row oriented), represented by - * (ata_colptr, ata_rowind). - * - * This routine is modified from GETATA routine by Tim Davis. - * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2, - * i.e., the sum of the square of the row counts. - * - * Questions - * ========= - * o Do I need to withhold the *dense* rows? - * o How do I know the number of nonzeros in A'*A? - * - */ -{ - register int i, j, k, col, num_nz, ti, trow; - int *marker, *b_colptr, *b_rowind; - int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ - - if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) ) - ABORT("SUPERLU_MALLOC fails for marker[]"); - if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) ) - ABORT("SUPERLU_MALLOC t_colptr[]"); - if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) ) - ABORT("SUPERLU_MALLOC fails for t_rowind[]"); - - - /* Get counts of each column of T, and set up column pointers */ - for (i = 0; i < m; ++i) marker[i] = 0; - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) - ++marker[rowind[i]]; - } - t_colptr[0] = 0; - for (i = 0; i < m; ++i) { - t_colptr[i+1] = t_colptr[i] + marker[i]; - marker[i] = t_colptr[i]; - } - - /* Transpose the matrix from A to T */ - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) { - col = rowind[i]; - t_rowind[marker[col]] = j; - ++marker[col]; - } - - - /* ---------------------------------------------------------------- - compute B = T * A, where column j of B is: - - Struct (B_*j) = UNION ( Struct (T_*k) ) - A_kj != 0 - - do not include the diagonal entry - - ( Partition A as: A = (A_*1, ..., A_*n) - Then B = T * A = (T * A_*1, ..., T * A_*n), where - T * A_*j = (T_*1, ..., T_*m) * A_*j. ) - ---------------------------------------------------------------- */ - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* First pass determines number of nonzeros in B */ - num_nz = 0; - for (j = 0; j < n; ++j) { - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - for (i = colptr[j]; i < colptr[j+1]; ++i) { - /* A_kj is nonzero, add pattern of column T_*k to B_*j */ - k = rowind[i]; - for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { - trow = t_rowind[ti]; - if ( marker[trow] != j ) { - marker[trow] = j; - num_nz++; - } - } - } - } - *atanz = num_nz; - - /* Allocate storage for A'*A */ - if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for ata_colptr[]"); - if ( *atanz ) { - if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for ata_rowind[]"); - } - b_colptr = *ata_colptr; /* aliasing */ - b_rowind = *ata_rowind; - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* Compute each column of B, one at a time */ - num_nz = 0; - for (j = 0; j < n; ++j) { - b_colptr[j] = num_nz; - - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - for (i = colptr[j]; i < colptr[j+1]; ++i) { - /* A_kj is nonzero, add pattern of column T_*k to B_*j */ - k = rowind[i]; - for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { - trow = t_rowind[ti]; - if ( marker[trow] != j ) { - marker[trow] = j; - b_rowind[num_nz++] = trow; - } - } - } - } - b_colptr[n] = num_nz; - - SUPERLU_FREE(marker); - SUPERLU_FREE(t_colptr); - SUPERLU_FREE(t_rowind); -} - - -void -at_plus_a( - const int n, /* number of columns in matrix A. */ - const int nz, /* number of nonzeros in matrix A */ - int *colptr, /* column pointer of size n+1 for matrix A. */ - int *rowind, /* row indices of size nz for matrix A. */ - int *bnz, /* out - on exit, returns the actual number of - nonzeros in matrix A'*A. */ - int **b_colptr, /* out - size n+1 */ - int **b_rowind /* out - size *bnz */ - ) -{ -/* - * Purpose - * ======= - * - * Form the structure of A'+A. A is an n-by-n matrix in column oriented - * format represented by (colptr, rowind). The output A'+A is in column - * oriented format (symmetrically, also row oriented), represented by - * (b_colptr, b_rowind). - * - */ - register int i, j, k, col, num_nz; - int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ - int *marker; - - if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for marker[]"); - if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for t_colptr[]"); - if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails t_rowind[]"); - - - /* Get counts of each column of T, and set up column pointers */ - for (i = 0; i < n; ++i) marker[i] = 0; - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j+1]; ++i) - ++marker[rowind[i]]; - } - t_colptr[0] = 0; - for (i = 0; i < n; ++i) { - t_colptr[i+1] = t_colptr[i] + marker[i]; - marker[i] = t_colptr[i]; - } - - /* Transpose the matrix from A to T */ - for (j = 0; j < n; ++j) - for (i = colptr[j]; i < colptr[j+1]; ++i) { - col = rowind[i]; - t_rowind[marker[col]] = j; - ++marker[col]; - } - - - /* ---------------------------------------------------------------- - compute B = A + T, where column j of B is: - - Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) - - do not include the diagonal entry - ---------------------------------------------------------------- */ - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* First pass determines number of nonzeros in B */ - num_nz = 0; - for (j = 0; j < n; ++j) { - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - /* Add pattern of column A_*k to B_*j */ - for (i = colptr[j]; i < colptr[j+1]; ++i) { - k = rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - - /* Add pattern of column T_*k to B_*j */ - for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { - k = t_rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - ++num_nz; - } - } - } - *bnz = num_nz; - - /* Allocate storage for A+A' */ - if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for b_colptr[]"); - if ( *bnz) { - if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) ) - ABORT("SUPERLU_MALLOC fails for b_rowind[]"); - } - - /* Zero the diagonal flag */ - for (i = 0; i < n; ++i) marker[i] = -1; - - /* Compute each column of B, one at a time */ - num_nz = 0; - for (j = 0; j < n; ++j) { - (*b_colptr)[j] = num_nz; - - /* Flag the diagonal so it's not included in the B matrix */ - marker[j] = j; - - /* Add pattern of column A_*k to B_*j */ - for (i = colptr[j]; i < colptr[j+1]; ++i) { - k = rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - (*b_rowind)[num_nz++] = k; - } - } - - /* Add pattern of column T_*k to B_*j */ - for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { - k = t_rowind[i]; - if ( marker[k] != j ) { - marker[k] = j; - (*b_rowind)[num_nz++] = k; - } - } - } - (*b_colptr)[n] = num_nz; - - SUPERLU_FREE(marker); - SUPERLU_FREE(t_colptr); - SUPERLU_FREE(t_rowind); -} - -void -get_perm_c(int ispec, SuperMatrix *A, int *perm_c) -/* - * Purpose - * ======= - * - * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple - * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'. - * or using approximate minimum degree column ordering by Davis et. al. - * The LU factorization of A*Pc tends to have less fill than the LU - * factorization of A. - * - * Arguments - * ========= - * - * ispec (input) int - * Specifies the type of column ordering to reduce fill: - * = 1: minimum degree on the structure of A^T * A - * = 2: minimum degree on the structure of A^T + A - * = 3: approximate minimum degree for unsymmetric matrices - * If ispec == 0, the natural ordering (i.e., Pc = I) is returned. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A - * can be: Stype = NC; Dtype = _D; Mtype = GE. In the future, - * more general A can be handled. - * - * perm_c (output) int* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - */ -{ - NCformat *Astore = (NCformat*) A->Store; - int m, n, bnz = 0, *b_colptr, i; - int delta, maxint, nofsub, *invp; - int *b_rowind, *dhead, *qsize, *llist, *marker; - double t;//, SuperLU_timer_(); - - m = A->nrow; - n = A->ncol; - - t = SuperLU_timer_(); - switch ( ispec ) { - case 0: /* Natural ordering */ - for (i = 0; i < n; ++i) perm_c[i] = i; -#if ( PRNTlevel>=1 ) - printf("Use natural column ordering.\n"); -#endif - return; - case 1: /* Minimum degree ordering on A'*A */ - getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); -#if ( PRNTlevel>=1 ) - printf("Use minimum degree ordering on A'*A.\n"); -#endif - t = SuperLU_timer_() - t; - /*printf("Form A'*A time = %8.3f\n", t);*/ - break; - case 2: /* Minimum degree ordering on A'+A */ - if ( m != n ) ABORT("Matrix is not square"); - at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); -#if ( PRNTlevel>=1 ) - printf("Use minimum degree ordering on A'+A.\n"); -#endif - t = SuperLU_timer_() - t; - /*printf("Form A'+A time = %8.3f\n", t);*/ - break; - case 3: /* Approximate minimum degree column ordering. */ - get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind, - perm_c); -#if ( PRNTlevel>=1 ) - printf(".. Use approximate minimum degree column ordering.\n"); -#endif - return; - default: - ABORT("Invalid ISPEC"); - } - - if ( bnz != 0 ) { - t = SuperLU_timer_(); - - /* Initialize and allocate storage for GENMMD. */ - delta = 1; /* DELTA is a parameter to allow the choice of nodes - whose degree <= min-degree + DELTA. */ - maxint = 2147483647; /* 2**31 - 1 */ - invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); - if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp."); - dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); - if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead."); - qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); - if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize."); - llist = (int *) SUPERLU_MALLOC(n*sizeof(int)); - if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist."); - marker = (int *) SUPERLU_MALLOC(n*sizeof(int)); - if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker."); - - /* Transform adjacency list into 1-based indexing required by GENMMD.*/ - for (i = 0; i <= n; ++i) ++b_colptr[i]; - for (i = 0; i < bnz; ++i) ++b_rowind[i]; - - genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, - qsize, llist, marker, &maxint, &nofsub); - - /* Transform perm_c into 0-based indexing. */ - for (i = 0; i < n; ++i) --perm_c[i]; - - SUPERLU_FREE(invp); - SUPERLU_FREE(dhead); - SUPERLU_FREE(qsize); - SUPERLU_FREE(llist); - SUPERLU_FREE(marker); - SUPERLU_FREE(b_rowind); - - t = SuperLU_timer_() - t; - /* printf("call GENMMD time = %8.3f\n", t);*/ - - } else { /* Empty adjacency structure */ - for (i = 0; i < n; ++i) perm_c[i] = i; - } - - SUPERLU_FREE(b_colptr); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -void -heap_relax_snode ( - const int n, - int *et, /* column elimination tree */ - const int relax_columns, /* max no of columns allowed in a - relaxed snode */ - int *descendants, /* no of descendants of each node - in the etree */ - int *relax_end /* last column in a supernode */ - ) -{ -/* - * Purpose - * ======= - * relax_snode() - Identify the initial relaxed supernodes, assuming that - * the matrix has been reordered according to the postorder of the etree. - * - */ - register int i, j, k, l, parent; - register int snode_start; /* beginning of a snode */ - int *et_save, *post, *inv_post, *iwork; - int nsuper_et = 0, nsuper_et_post = 0; - - /* The etree may not be postordered, but is heap ordered. */ - - iwork = (int*) intMalloc(3*n+2); - if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); - inv_post = iwork + n+1; - et_save = inv_post + n+1; - - /* Post order etree */ - post = (int *) TreePostorder(n, et); - for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; - - /* Renumber etree in postorder */ - for (i = 0; i < n; ++i) { - iwork[post[i]] = post[et[i]]; - et_save[i] = et[i]; /* Save the original etree */ - } - for (i = 0; i < n; ++i) et[i] = iwork[i]; - - /* Compute the number of descendants of each node in the etree */ - ifill (relax_end, n, EMPTY); - for (j = 0; j < n; j++) descendants[j] = 0; - for (j = 0; j < n; j++) { - parent = et[j]; - if ( parent != n ) /* not the dummy root */ - descendants[parent] += descendants[j] + 1; - } - - /* Identify the relaxed supernodes by postorder traversal of the etree. */ - for (j = 0; j < n; ) { - parent = et[j]; - snode_start = j; - while ( parent != n && descendants[parent] < relax_columns ) { - j = parent; - parent = et[j]; - } - /* Found a supernode in postordered etree; j is the last column. */ - ++nsuper_et_post; - k = n; - for (i = snode_start; i <= j; ++i) - k = SUPERLU_MIN(k, inv_post[i]); - l = inv_post[j]; - if ( (l - k) == (j - snode_start) ) { - /* It's also a supernode in the original etree */ - relax_end[k] = l; /* Last column is recorded */ - ++nsuper_et; - } else { - for (i = snode_start; i <= j; ++i) { - l = inv_post[i]; - if ( descendants[i] == 0 ) relax_end[l] = l; - } - } - j++; - /* Search for a new leaf */ - while ( descendants[j] != 0 && j < n ) j++; - } - -#if ( PRNTlevel>=1 ) - printf(".. heap_snode_relax:\n" - "\tNo of relaxed snodes in postordered etree:\t%d\n" - "\tNo of relaxed snodes in original etree:\t%d\n", - nsuper_et_post, nsuper_et); -#endif - - /* Recover the original etree */ - for (i = 0; i < n; ++i) et[i] = et_save[i]; - - SUPERLU_FREE(post); - SUPERLU_FREE(iwork); -} - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/icmax1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/icmax1.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/icmax1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/icmax1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -#include -#include "slu_scomplex.h" -#include "slu_Cnames.h" - -int icmax1_(int *n, complex *cx, int *incx) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - ICMAX1 finds the index of the element whose real part has maximum - absolute value. - - Based on ICAMAX from Level 1 BLAS. - The change is to use the 'genuine' absolute value. - - Contributed by Nick Higham for use with CLACON. - - Arguments - ========= - - N (input) INT - The number of elements in the vector CX. - - CX (input) COMPLEX array, dimension (N) - The vector whose elements will be summed. - - INCX (input) INT - The spacing between successive values of CX. INCX >= 1. - - ===================================================================== - - - - NEXT LINE IS THE ONLY MODIFICATION. - - - Parameter adjustments - Function Body */ - /* System generated locals */ - int ret_val, i__1, i__2; - float r__1; - /* Local variables */ - static float smax; - static int i, ix; - - -#define CX(I) cx[(I)-1] - - - ret_val = 0; - if (*n < 1) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L30; - } - -/* CODE FOR INCREMENT NOT EQUAL TO 1 */ - - ix = 1; - smax = (r__1 = CX(1).r, fabs(r__1)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = ix; - if ((r__1 = CX(ix).r, fabs(r__1)) <= smax) { - goto L10; - } - ret_val = i; - i__2 = ix; - smax = (r__1 = CX(ix).r, fabs(r__1)); -L10: - ix += *incx; -/* L20: */ - } - return ret_val; - -/* CODE FOR INCREMENT EQUAL TO 1 */ - -L30: - smax = (r__1 = CX(1).r, fabs(r__1)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = i; - if ((r__1 = CX(i).r, fabs(r__1)) <= smax) { - goto L40; - } - ret_val = i; - i__2 = i; - smax = (r__1 = CX(i).r, fabs(r__1)); -L40: - ; - } - return ret_val; - -/* End of ICMAX1 */ - -} /* icmax1_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/izmax1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/izmax1.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/izmax1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/izmax1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,103 +0,0 @@ -#include -#include "slu_Cnames.h" -#include "slu_dcomplex.h" - -int -izmax1_(int *n, doublecomplex *cx, int *incx) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - IZMAX1 finds the index of the element whose real part has maximum - absolute value. - - Based on IZAMAX from Level 1 BLAS. - The change is to use the 'genuine' absolute value. - - Contributed by Nick Higham for use with ZLACON. - - Arguments - ========= - - N (input) INT - The number of elements in the vector CX. - - CX (input) COMPLEX*16 array, dimension (N) - The vector whose elements will be summed. - - INCX (input) INT - The spacing between successive values of CX. INCX >= 1. - - ===================================================================== -*/ - - /* System generated locals */ - int ret_val, i__1, i__2; - double d__1; - - /* Local variables */ - double smax; - int i, ix; - -#define CX(I) cx[(I)-1] - - ret_val = 0; - if (*n < 1) { - return ret_val; - } - ret_val = 1; - if (*n == 1) { - return ret_val; - } - if (*incx == 1) { - goto L30; - } - -/* CODE FOR INCREMENT NOT EQUAL TO 1 */ - - ix = 1; - smax = (d__1 = CX(1).r, fabs(d__1)); - ix += *incx; - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = ix; - if ((d__1 = CX(ix).r, fabs(d__1)) <= smax) { - goto L10; - } - ret_val = i; - i__2 = ix; - smax = (d__1 = CX(ix).r, fabs(d__1)); -L10: - ix += *incx; -/* L20: */ - } - return ret_val; - -/* CODE FOR INCREMENT EQUAL TO 1 */ - -L30: - smax = (d__1 = CX(1).r, fabs(d__1)); - i__1 = *n; - for (i = 2; i <= *n; ++i) { - i__2 = i; - if ((d__1 = CX(i).r, fabs(d__1)) <= smax) { - goto L40; - } - ret_val = i; - i__2 = i; - smax = (d__1 = CX(i).r, fabs(d__1)); -L40: - ; - } - return ret_val; - -/* End of IZMAX1 */ - -} /* izmax1_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/lsame.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/lsame.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/lsame.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/lsame.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -#include "slu_Cnames.h" - -int lsame_(char *ca, char *cb) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - Purpose - ======= - - LSAME returns .TRUE. if CA is the same letter as CB regardless of case. - - Arguments - ========= - - CA (input) CHARACTER*1 - CB (input) CHARACTER*1 - CA and CB specify the single characters to be compared. - - ===================================================================== -*/ - - /* System generated locals */ - int ret_val; - - /* Local variables */ - int inta, intb, zcode; - - ret_val = *(unsigned char *)ca == *(unsigned char *)cb; - if (ret_val) { - return ret_val; - } - - /* Now test for equivalence if both characters are alphabetic. */ - - zcode = 'Z'; - - /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime - machines, on which ICHAR returns a value with bit 8 set. - ICHAR('A') on Prime machines returns 193 which is the same as - ICHAR('A') on an EBCDIC machine. */ - - inta = *(unsigned char *)ca; - intb = *(unsigned char *)cb; - - if (zcode == 90 || zcode == 122) { - /* ASCII is assumed - ZCODE is the ASCII code of either lower or - upper case 'Z'. */ - if (inta >= 97 && inta <= 122) inta += -32; - if (intb >= 97 && intb <= 122) intb += -32; - - } else if (zcode == 233 || zcode == 169) { - /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or - upper case 'Z'. */ - if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta - >= 162 && inta <= 169) - inta += 64; - if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb - >= 162 && intb <= 169) - intb += 64; - } else if (zcode == 218 || zcode == 250) { - /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code - plus 128 of either lower or upper case 'Z'. */ - if (inta >= 225 && inta <= 250) inta += -32; - if (intb >= 225 && intb <= 250) intb += -32; - } - ret_val = inta == intb; - return ret_val; - -} /* lsame_ */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Makefile hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Makefile --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,218 +0,0 @@ -# makefile for sparse supernodal LU, implemented in ANSI C -include ../make.inc - -####################################################################### -# This is the makefile to create a library for SuperLU. -# The files are organized as follows: -# -# ALLAUX -- Auxiliary routines called from all precisions of SuperLU -# LAAUX -- LAPACK auxiliary routines called from all precisions -# SLASRC -- LAPACK single precision real routines -# DLASRC -- LAPACK double precision real routines -# CLASRC -- LAPACK single precision complex routines -# ZLASRC -- LAPACK double precision complex routines -# SCLAUX -- LAPACK Auxiliary routines called from both real and complex -# DZLAUX -- LAPACK Auxiliary routines called from both double precision -# and complex*16 -# SLUSRC -- Single precision real SuperLU routines -# DLUSRC -- Double precision real SuperLU routines -# CLUSRC -- Single precision complex SuperLU routines -# ZLUSRC -- Double precision complex SuperLU routines -# -# The library can be set up to include routines for any combination -# of the four precisions. To create or add to the library, enter make -# followed by one or more of the precisions desired. Some examples: -# make single -# make single double -# make single double complex complex16 -# Alternatively, the command -# make -# without any arguments creates a library of all four precisions. -# The library is called -# superlu.a -# and is created at the next higher directory level. -# -# To remove the object files after the library is created, enter -# make clean -# -####################################################################### - -####################################### -### added for hypre -####################################### -include ../../../config/Makefile.config -CINCLUDES = ${INCLUDES} ${MPIINCLUDE} -C_COMPILE_FLAGS = \ - -I$(srcdir)\ - -I$(srcdir)/../../../\ - -I$(srcdir)/../../../utilities\ - -I$(srcdir)/../../../blas\ - -I$(srcdir)/../../../lapack\ - ${CINCLUDES} - -HEADERS =\ - colamd.h\ - slu_ddefs.h\ - slu_util.h\ - supermatrix.h -####################################### - -### LAPACK -LAAUX = lsame.o xerbla.o -SLASRC = slacon.o -DLASRC = dlacon.o -CLASRC = clacon.o scsum1.o icmax1.o -ZLASRC = zlacon.o dzsum1.o izmax1.o -SCLAUX = slamch.o -DZLAUX = dlamch.o - -### SuperLU -ALLAUX = superlu_timer.o util.o memory.o get_perm_c.o mmd.o \ - sp_coletree.o sp_preorder.o sp_ienv.o relax_snode.o \ - heap_relax_snode.o colamd.o - -SLUSRC = \ - sgssv.o sgssvx.o \ - ssp_blas2.o ssp_blas3.o sgscon.o \ - slangs.o sgsequ.o slaqgs.o spivotgrowth.o \ - sgsrfs.o sgstrf.o sgstrs.o scopy_to_ucol.o \ - ssnode_dfs.o ssnode_bmod.o \ - spanel_dfs.o spanel_bmod.o sreadhb.o \ - scolumn_dfs.o scolumn_bmod.o spivotL.o spruneL.o \ - smemory.o sutil.o smyblas2.o - -DLUSRC = \ - dgssv.o dgssvx.o \ - dsp_blas2.o dsp_blas3.o dgscon.o \ - dlangs.o dgsequ.o dlaqgs.o dpivotgrowth.o \ - dgsrfs.o dgstrf.o dgstrs.o dcopy_to_ucol.o \ - dsnode_dfs.o dsnode_bmod.o \ - dpanel_dfs.o dpanel_bmod.o dreadhb.o \ - dcolumn_dfs.o dcolumn_bmod.o dpivotL.o dpruneL.o \ - dmemory.o dutil.o dmyblas2.o - -CLUSRC = \ - scomplex.o cgssv.o cgssvx.o csp_blas2.o csp_blas3.o cgscon.o \ - clangs.o cgsequ.o claqgs.o cpivotgrowth.o \ - cgsrfs.o cgstrf.o cgstrs.o ccopy_to_ucol.o \ - csnode_dfs.o csnode_bmod.o \ - cpanel_dfs.o cpanel_bmod.o creadhb.o \ - ccolumn_dfs.o ccolumn_bmod.o cpivotL.o cpruneL.o \ - cmemory.o cutil.o cmyblas2.o - -ZLUSRC = \ - dcomplex.o zgssv.o zgssvx.o zsp_blas2.o zsp_blas3.o zgscon.o \ - zlangs.o zgsequ.o zlaqgs.o zpivotgrowth.o \ - zgsrfs.o zgstrf.o zgstrs.o zcopy_to_ucol.o \ - zsnode_dfs.o zsnode_bmod.o \ - zpanel_dfs.o zpanel_bmod.o zreadhb.o \ - zcolumn_dfs.o zcolumn_bmod.o zpivotL.o zpruneL.o \ - zmemory.o zutil.o zmyblas2.o - -####################################### -### changed for hypre -####################################### - -FILES =\ - colamd.c \ - dcolumn_bmod.c \ - dcolumn_dfs.c \ - dcopy_to_ucol.c \ - dgscon.c \ - dgsequ.c \ - dgsrfs.c \ - dgssv.c \ - dgssvx.c \ - dgstrf.c \ - dgstrs.c \ - dlacon.c \ - dlangs.c \ - dlaqgs.c \ - dmemory.c \ - dpanel_bmod.c \ - dpanel_dfs.c \ - dpivotgrowth.c \ - dpivotL.c \ - dpruneL.c \ - dreadhb.c \ - dsnode_bmod.c \ - dsnode_dfs.c \ - dsp_blas2.c \ - dsp_blas3.c \ - dutil.c \ - get_perm_c.c \ - heap_relax_snode.o \ - memory.o \ - mmd.c \ - relax_snode.c \ - sp_coletree.c \ - sp_ienv.c \ - sp_preorder.c \ - superlu_timer.c \ - slu_util.c - -OBJS = ${FILES:.c=.o} - -all: libHYPRE_superlu${HYPRE_LIB_SUFFIX} - cp -fp *.h $(HYPRE_BUILD_DIR)/include -# cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib - -install: libHYPRE_superlu${HYPRE_LIB_SUFFIX} - cp -f *.h $(HYPRE_INC_INSTALL) -# cp -f libHYPRE* $(HYPRE_LIB_INSTALL) - @echo " " - -clean: - rm -f *.o libHYPRE* - rm -rf pchdir tca.map *inslog* -distclean: clean - -libHYPRE_superlu.a: ${OBJS} - @echo "Building $@ ... " - ${AR} $@ ${OBJS} - ${RANLIB} $@ - -libHYPRE_superlu.so: ${OBJS} - @echo "Building $@ ... " - ${BUILD_CC_SHARED} -o $@ ${OBJS} - -${OBJS}: ${HEADERS} - -####################################### - -all4: single double complex complex16 - -single: $(SLUSRC) $(ALLAUX) $(LAAUX) $(SLASRC) $(SCLAUX) - $(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \ - $(SLUSRC) $(ALLAUX) $(LAAUX) $(SLASRC) $(SCLAUX) - $(RANLIB) ../$(SUPERLULIB) - -double: $(DLUSRC) $(ALLAUX) $(LAAUX) $(DLASRC) $(DZLAUX) - $(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \ - $(DLUSRC) $(ALLAUX) $(LAAUX) $(DLASRC) $(DZLAUX) - $(RANLIB) ../$(SUPERLULIB) - -complex: $(CLUSRC) $(ALLAUX) $(LAAUX) $(CLASRC) $(SCLAUX) - $(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \ - $(CLUSRC) $(ALLAUX) $(LAAUX) $(CLASRC) $(SCLAUX) - $(RANLIB) ../$(SUPERLULIB) - -complex16: $(ZLUSRC) $(ALLAUX) $(LAAUX) $(ZLASRC) $(DZLAUX) - $(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \ - $(ZLUSRC) $(ALLAUX) $(LAAUX) $(ZLASRC) $(DZLAUX) - $(RANLIB) ../$(SUPERLULIB) - - -################################## -# Do not optimize these routines # -################################## -slamch.o: slamch.c ; $(CC) -g -c ${SHARED_COMPILE_FLAG} $< -dlamch.o: dlamch.c ; $(CC) -g -c ${SHARED_COMPILE_FLAG} $< -#superlu_timer.o: superlu_timer.c ; $(CC) -g -c -DNO_TIMER ${SHARED_COMPILE_FLAG} $< -################################## - -#.c.o: -# $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) - -#clean: -# rm -f *.o ../libsuperlu_3.0.a diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/memory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/memory.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/memory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/memory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,208 +0,0 @@ -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/** Precision-independent memory-related routines. - (Shared by [sdcz]memory.c) **/ - -#include "slu_ddefs.h" - - -#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ -int superlu_malloc_total = 0; - -#define PAD_FACTOR 2 -#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ -/* size_t is usually defined as 'unsigned long' */ - -void *superlu_malloc(size_t size) -{ - char *buf; - - buf = (char *) malloc(size + DWORD); - if ( !buf ) { - printf("superlu_malloc fails: malloc_total %.0f MB, size %ld\n", - superlu_malloc_total*1e-6, size); - ABORT("superlu_malloc: out of memory"); - } - - ((int_t *) buf)[0] = size; -#if 0 - superlu_malloc_total += size + DWORD; -#else - superlu_malloc_total += size; -#endif - return (void *) (buf + DWORD); -} - -void superlu_free(void *addr) -{ - char *p = ((char *) addr) - DWORD; - - if ( !addr ) - ABORT("superlu_free: tried to free NULL pointer"); - - if ( !p ) - ABORT("superlu_free: tried to free NULL+DWORD pointer"); - - { - int_t n = ((int_t *) p)[0]; - - if ( !n ) - ABORT("superlu_free: tried to free a freed pointer"); - *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ -#if 0 - superlu_malloc_total -= (n + DWORD); -#else - superlu_malloc_total -= n; -#endif - - if ( superlu_malloc_total < 0 ) - ABORT("superlu_malloc_total went negative!"); - - /*free (addr);*/ - free (p); - } - -} - -#else /* production mode */ - -void *superlu_malloc(size_t size) -{ - void *buf; - buf = (void *) malloc(size); - return (buf); -} - -void superlu_free(void *addr) -{ - free (addr); -} - -#endif - - -/* - * Set up pointers for integer working arrays. - */ -void -SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep, - int **parent, int **xplore, int **repfnz, int **panel_lsub, - int **xprune, int **marker) -{ - *segrep = iworkptr; - *parent = iworkptr + m; - *xplore = *parent + m; - *repfnz = *xplore + m; - *panel_lsub = *repfnz + panel_size * m; - *xprune = *panel_lsub + panel_size * m; - *marker = *xprune + n; - ifill (*repfnz, m * panel_size, EMPTY); - ifill (*panel_lsub, m * panel_size, EMPTY); -} - - -void -copy_mem_int(int howmany, void *oldM, void *newM) -{ - register int i; - int *iold = (int*) oldM; - int *inew = (int*) newM; - for (i = 0; i < howmany; i++) inew[i] = iold[i]; -} - - -void -user_bcopy(char *src, char *dest, int bytes) -{ - char *s_ptr, *d_ptr; - - s_ptr = src + bytes - 1; - d_ptr = dest + bytes - 1; - for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; -} - - - -int *intMalloc(int n) -{ - int *buf; - buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC fails for buf in intMalloc()"); - } - return (buf); -} - -int *intCalloc(int n) -{ - int *buf; - register int i; - buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC fails for buf in intCalloc()"); - } - for (i = 0; i < n; ++i) buf[i] = 0; - return (buf); -} - - - -#if 0 -check_expanders() -{ - int p; - printf("Check expanders:\n"); - for (p = 0; p < NO_MEMTYPE; p++) { - printf("type %d, size %d, mem %d\n", - p, expanders[p].size, (int)expanders[p].mem); - } - - return 0; -} - - -StackInfo() -{ - printf("Stack: size %d, used %d, top1 %d, top2 %d\n", - stack.size, stack.used, stack.top1, stack.top2); - return 0; -} - - - -PrintStack(char *msg, GlobalLU_t *Glu) -{ - int i; - int *xlsub, *lsub, *xusub, *usub; - - xlsub = Glu->xlsub; - lsub = Glu->lsub; - xusub = Glu->xusub; - usub = Glu->usub; - - printf("%s\n", msg); - -/* printf("\nUCOL: "); - for (i = 0; i < xusub[ndim]; ++i) - printf("%f ", ucol[i]); - - printf("\nLSUB: "); - for (i = 0; i < xlsub[ndim]; ++i) - printf("%d ", lsub[i]); - - printf("\nUSUB: "); - for (i = 0; i < xusub[ndim]; ++i) - printf("%d ", usub[i]); - - printf("\n");*/ - return 0; -} -#endif - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/mmd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/mmd.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/mmd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/mmd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1012 +0,0 @@ - -typedef int shortint; - -/* *************************************************************** */ -/* *************************************************************** */ -/* **** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE **** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */ -/* ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION */ -/* OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */ -/* NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS */ -/* THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */ -/* EXTERNAL DEGREE. */ -/* --------------------------------------------- */ -/* CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */ -/* DESTROYED. */ -/* --------------------------------------------- */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */ -/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ -/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */ -/* (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */ -/* NODES. */ - -/* OUTPUT PARAMETERS - */ -/* PERM - THE MINIMUM DEGREE ORDERING. */ -/* INVP - THE INVERSE OF PERM. */ -/* NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */ -/* SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */ - -/* WORKING PARAMETERS - */ -/* DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. */ -/* INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. */ -/* PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */ -/* QSIZE - VECTOR FOR SIZE OF SUPERNODES. */ -/* LLIST - VECTOR FOR TEMPORARY LINKED LISTS. */ -/* MARKER - A TEMPORARY MARKER VECTOR. */ - -/* PROGRAM SUBROUTINES - */ -/* MMDELM, MMDINT, MMDNUM, MMDUPD. */ - -/* *************************************************************** */ - -/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, - shortint *invp, shortint *perm, int *delta, shortint *dhead, - shortint *qsize, shortint *llist, shortint *marker, int *maxint, - int *nofsub) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int mdeg, ehead, i, mdlmt, mdnode; - extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, - shortint *, shortint *, shortint *, shortint *, shortint *, - shortint *, int *, int *), mmdupd_(int *, int *, - int *, shortint *, int *, int *, shortint *, shortint - *, shortint *, shortint *, shortint *, shortint *, int *, - int *), mmdint_(int *, int *, shortint *, shortint *, - shortint *, shortint *, shortint *, shortint *, shortint *), - mmdnum_(int *, shortint *, shortint *, shortint *); - static int nextmd, tag, num; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dhead; - --perm; - --invp; - --adjncy; - --xadj; - - /* Function Body */ - if (*neqns <= 0) { - return 0; - } - -/* ------------------------------------------------ */ -/* INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */ -/* ------------------------------------------------ */ - *nofsub = 0; - mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & - qsize[1], &llist[1], &marker[1]); - -/* ---------------------------------------------- */ -/* NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */ -/* ---------------------------------------------- */ - num = 1; - -/* ----------------------------- */ -/* ELIMINATE ALL ISOLATED NODES. */ -/* ----------------------------- */ - nextmd = dhead[1]; -L100: - if (nextmd <= 0) { - goto L200; - } - mdnode = nextmd; - nextmd = invp[mdnode]; - marker[mdnode] = *maxint; - invp[mdnode] = -num; - ++num; - goto L100; - -L200: -/* ---------------------------------------- */ -/* SEARCH FOR NODE OF THE MINIMUM DEGREE. */ -/* MDEG IS THE CURRENT MINIMUM DEGREE; */ -/* TAG IS USED TO FACILITATE MARKING NODES. */ -/* ---------------------------------------- */ - if (num > *neqns) { - goto L1000; - } - tag = 1; - dhead[1] = 0; - mdeg = 2; -L300: - if (dhead[mdeg] > 0) { - goto L400; - } - ++mdeg; - goto L300; -L400: -/* ------------------------------------------------- */ -/* USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */ -/* WHEN A DEGREE UPDATE IS TO BE PERFORMED. */ -/* ------------------------------------------------- */ - mdlmt = mdeg + *delta; - ehead = 0; - -L500: - mdnode = dhead[mdeg]; - if (mdnode > 0) { - goto L600; - } - ++mdeg; - if (mdeg > mdlmt) { - goto L900; - } - goto L500; -L600: -/* ---------------------------------------- */ -/* REMOVE MDNODE FROM THE DEGREE STRUCTURE. */ -/* ---------------------------------------- */ - nextmd = invp[mdnode]; - dhead[mdeg] = nextmd; - if (nextmd > 0) { - perm[nextmd] = -mdeg; - } - invp[mdnode] = -num; - *nofsub = *nofsub + mdeg + qsize[mdnode] - 2; - if (num + qsize[mdnode] > *neqns) { - goto L1000; - } -/* ---------------------------------------------- */ -/* ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */ -/* TRANSFORMATION. RESET TAG VALUE IF NECESSARY. */ -/* ---------------------------------------------- */ - ++tag; - if (tag < *maxint) { - goto L800; - } - tag = 1; - i__1 = *neqns; - for (i = 1; i <= i__1; ++i) { - if (marker[i] < *maxint) { - marker[i] = 0; - } -/* L700: */ - } -L800: - mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & - qsize[1], &llist[1], &marker[1], maxint, &tag); - num += qsize[mdnode]; - llist[mdnode] = ehead; - ehead = mdnode; - if (*delta >= 0) { - goto L500; - } -L900: -/* ------------------------------------------- */ -/* UPDATE DEGREES OF THE NODES INVOLVED IN THE */ -/* MINIMUM DEGREE NODES ELIMINATION. */ -/* ------------------------------------------- */ - if (num > *neqns) { - goto L1000; - } - mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], & - invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag) - ; - goto L300; - -L1000: - mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]); - return 0; - -} /* genmmd_ */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* *** MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION *** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */ -/* MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */ -/* ALGORITHM. */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ - -/* OUTPUT PARAMETERS - */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). */ -/* LLIST - LINKED LIST. */ -/* MARKER - MARKER VECTOR. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, - shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, - shortint *llist, shortint *marker) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int ndeg, node, fnode; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - dhead[node] = 0; - qsize[node] = 1; - marker[node] = 0; - llist[node] = 0; -/* L100: */ - } -/* ------------------------------------------ */ -/* INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */ -/* ------------------------------------------ */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - ndeg = xadj[node + 1] - xadj[node] + 1; - fnode = dhead[ndeg]; - dforw[node] = fnode; - dhead[ndeg] = node; - if (fnode > 0) { - dbakw[fnode] = node; - } - dbakw[node] = -ndeg; -/* L200: */ - } - return 0; - -} /* mmdint_ */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */ -/* MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */ -/* IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO */ -/* TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */ -/* ELIMINATION GRAPH. */ - -/* INPUT PARAMETERS - */ -/* MDNODE - NODE OF MINIMUM DEGREE. */ -/* MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */ -/* INT. */ -/* TAG - TAG VALUE. */ - -/* UPDATED PARAMETERS - */ -/* (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE. */ -/* MARKER - MARKER VECTOR. */ -/* LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy, - shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, - shortint *llist, shortint *marker, int *maxint, int *tag) -{ - /* System generated locals */ - int i__1, i__2; - - /* Local variables */ - static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, - istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - -/* ----------------------------------------------- */ -/* FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */ -/* ----------------------------------------------- */ - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - marker[*mdnode] = *tag; - istrt = xadj[*mdnode]; - istop = xadj[*mdnode + 1] - 1; -/* ------------------------------------------------------- */ -/* ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */ -/* NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */ -/* FOR THE NEXT REACHABLE NODE. */ -/* ------------------------------------------------------- */ - elmnt = 0; - rloc = istrt; - rlmt = istop; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - nabor = adjncy[i]; - if (nabor == 0) { - goto L300; - } - if (marker[nabor] >= *tag) { - goto L200; - } - marker[nabor] = *tag; - if (dforw[nabor] < 0) { - goto L100; - } - adjncy[rloc] = nabor; - ++rloc; - goto L200; -L100: - llist[nabor] = elmnt; - elmnt = nabor; -L200: - ; - } -L300: -/* ----------------------------------------------------- */ -/* MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */ -/* ----------------------------------------------------- */ - if (elmnt <= 0) { - goto L1000; - } - adjncy[rlmt] = -elmnt; - link = elmnt; -L400: - jstrt = xadj[link]; - jstop = xadj[link + 1] - 1; - i__1 = jstop; - for (j = jstrt; j <= i__1; ++j) { - node = adjncy[j]; - link = -node; - if (node < 0) { - goto L400; - } else if (node == 0) { - goto L900; - } else { - goto L500; - } -L500: - if (marker[node] >= *tag || dforw[node] < 0) { - goto L800; - } - marker[node] = *tag; -/* --------------------------------- */ -/* USE STORAGE FROM ELIMINATED NODES */ -/* IF NECESSARY. */ -/* --------------------------------- */ -L600: - if (rloc < rlmt) { - goto L700; - } - link = -adjncy[rlmt]; - rloc = xadj[link]; - rlmt = xadj[link + 1] - 1; - goto L600; -L700: - adjncy[rloc] = node; - ++rloc; -L800: - ; - } -L900: - elmnt = llist[elmnt]; - goto L300; -L1000: - if (rloc <= rlmt) { - adjncy[rloc] = 0; - } -/* -------------------------------------------------------- */ -/* FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */ -/* -------------------------------------------------------- */ - link = *mdnode; -L1100: - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - rnode = adjncy[i]; - link = -rnode; - if (rnode < 0) { - goto L1100; - } else if (rnode == 0) { - goto L1800; - } else { - goto L1200; - } -L1200: -/* -------------------------------------------- */ -/* IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */ -/* -------------------------------------------- */ - pvnode = dbakw[rnode]; - if (pvnode == 0 || pvnode == -(*maxint)) { - goto L1300; - } -/* ------------------------------------- */ -/* THEN REMOVE RNODE FROM THE STRUCTURE. */ -/* ------------------------------------- */ - nxnode = dforw[rnode]; - if (nxnode > 0) { - dbakw[nxnode] = pvnode; - } - if (pvnode > 0) { - dforw[pvnode] = nxnode; - } - npv = -pvnode; - if (pvnode < 0) { - dhead[npv] = nxnode; - } -L1300: -/* ---------------------------------------- */ -/* PURGE INACTIVE QUOTIENT NABORS OF RNODE. */ -/* ---------------------------------------- */ - jstrt = xadj[rnode]; - jstop = xadj[rnode + 1] - 1; - xqnbr = jstrt; - i__2 = jstop; - for (j = jstrt; j <= i__2; ++j) { - nabor = adjncy[j]; - if (nabor == 0) { - goto L1500; - } - if (marker[nabor] >= *tag) { - goto L1400; - } - adjncy[xqnbr] = nabor; - ++xqnbr; -L1400: - ; - } -L1500: -/* ---------------------------------------- */ -/* IF NO ACTIVE NABOR AFTER THE PURGING ... */ -/* ---------------------------------------- */ - nqnbrs = xqnbr - jstrt; - if (nqnbrs > 0) { - goto L1600; - } -/* ----------------------------- */ -/* THEN MERGE RNODE WITH MDNODE. */ -/* ----------------------------- */ - qsize[*mdnode] += qsize[rnode]; - qsize[rnode] = 0; - marker[rnode] = *maxint; - dforw[rnode] = -(*mdnode); - dbakw[rnode] = -(*maxint); - goto L1700; -L1600: -/* -------------------------------------- */ -/* ELSE FLAG RNODE FOR DEGREE UPDATE, AND */ -/* ADD MDNODE AS A NABOR OF RNODE. */ -/* -------------------------------------- */ - dforw[rnode] = nqnbrs + 1; - dbakw[rnode] = 0; - adjncy[xqnbr] = *mdnode; - ++xqnbr; - if (xqnbr <= jstop) { - adjncy[xqnbr] = 0; - } - -L1700: - ; - } -L1800: - return 0; - -} /* mmdelm_ */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ***** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */ -/* AFTER A MULTIPLE ELIMINATION STEP. */ - -/* INPUT PARAMETERS - */ -/* EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED */ -/* NODES (I.E., NEWLY FORMED ELEMENTS). */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ -/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ -/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */ -/* INTEGER. */ - -/* UPDATED PARAMETERS - */ -/* MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */ -/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ -/* QSIZE - SIZE OF SUPERNODE. */ -/* LLIST - WORKING LINKED LIST. */ -/* MARKER - MARKER VECTOR FOR DEGREE UPDATE. */ -/* TAG - TAG VALUE. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, - shortint *adjncy, int *delta, int *mdeg, shortint *dhead, - shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, - shortint *marker, int *maxint, int *tag) -{ - /* System generated locals */ - int i__1, i__2; - - /* Local variables */ - static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, - istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --marker; - --llist; - --qsize; - --dbakw; - --dforw; - --dhead; - --adjncy; - --xadj; - - /* Function Body */ - mdeg0 = *mdeg + *delta; - elmnt = *ehead; -L100: -/* ------------------------------------------------------- */ -/* FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */ -/* (RESET TAG VALUE IF NECESSARY.) */ -/* ------------------------------------------------------- */ - if (elmnt <= 0) { - return 0; - } - mtag = *tag + mdeg0; - if (mtag < *maxint) { - goto L300; - } - *tag = 1; - i__1 = *neqns; - for (i = 1; i <= i__1; ++i) { - if (marker[i] < *maxint) { - marker[i] = 0; - } -/* L200: */ - } - mtag = *tag + mdeg0; -L300: -/* --------------------------------------------- */ -/* CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */ -/* WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */ -/* ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */ -/* THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, */ -/* NUMBER OF NODES IN THIS ELEMENT. */ -/* --------------------------------------------- */ - q2head = 0; - qxhead = 0; - deg0 = 0; - link = elmnt; -L400: - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - enode = adjncy[i]; - link = -enode; - if (enode < 0) { - goto L400; - } else if (enode == 0) { - goto L800; - } else { - goto L500; - } - -L500: - if (qsize[enode] == 0) { - goto L700; - } - deg0 += qsize[enode]; - marker[enode] = mtag; -/* ---------------------------------- */ -/* IF ENODE REQUIRES A DEGREE UPDATE, */ -/* THEN DO THE FOLLOWING. */ -/* ---------------------------------- */ - if (dbakw[enode] != 0) { - goto L700; - } -/* --------------------------------------- -*/ -/* PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. -*/ -/* --------------------------------------- -*/ - if (dforw[enode] == 2) { - goto L600; - } - llist[enode] = qxhead; - qxhead = enode; - goto L700; -L600: - llist[enode] = q2head; - q2head = enode; -L700: - ; - } -L800: -/* -------------------------------------------- */ -/* FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */ -/* -------------------------------------------- */ - enode = q2head; - iq2 = 1; -L900: - if (enode <= 0) { - goto L1500; - } - if (dbakw[enode] != 0) { - goto L2200; - } - ++(*tag); - deg = deg0; -/* ------------------------------------------ */ -/* IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */ -/* ------------------------------------------ */ - istrt = xadj[enode]; - nabor = adjncy[istrt]; - if (nabor == elmnt) { - nabor = adjncy[istrt + 1]; - } -/* ------------------------------------------------ */ -/* IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */ -/* ------------------------------------------------ */ - link = nabor; - if (dforw[nabor] < 0) { - goto L1000; - } - deg += qsize[nabor]; - goto L2100; -L1000: -/* -------------------------------------------- */ -/* OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */ -/* DO THE FOLLOWING. */ -/* -------------------------------------------- */ - istrt = xadj[link]; - istop = xadj[link + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - node = adjncy[i]; - link = -node; - if (node == enode) { - goto L1400; - } - if (node < 0) { - goto L1000; - } else if (node == 0) { - goto L2100; - } else { - goto L1100; - } - -L1100: - if (qsize[node] == 0) { - goto L1400; - } - if (marker[node] >= *tag) { - goto L1200; - } -/* ----------------------------------- --- */ -/* CASE WHEN NODE IS NOT YET CONSIDERED -. */ -/* ----------------------------------- --- */ - marker[node] = *tag; - deg += qsize[node]; - goto L1400; -L1200: -/* ---------------------------------------- - */ -/* CASE WHEN NODE IS INDISTINGUISHABLE FROM - */ -/* ENODE. MERGE THEM INTO A NEW SUPERNODE. - */ -/* ---------------------------------------- - */ - if (dbakw[node] != 0) { - goto L1400; - } - if (dforw[node] != 2) { - goto L1300; - } - qsize[enode] += qsize[node]; - qsize[node] = 0; - marker[node] = *maxint; - dforw[node] = -enode; - dbakw[node] = -(*maxint); - goto L1400; -L1300: -/* -------------------------------------- -*/ -/* CASE WHEN NODE IS OUTMATCHED BY ENODE. -*/ -/* -------------------------------------- -*/ - if (dbakw[node] == 0) { - dbakw[node] = -(*maxint); - } -L1400: - ; - } - goto L2100; -L1500: -/* ------------------------------------------------ */ -/* FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */ -/* ------------------------------------------------ */ - enode = qxhead; - iq2 = 0; -L1600: - if (enode <= 0) { - goto L2300; - } - if (dbakw[enode] != 0) { - goto L2200; - } - ++(*tag); - deg = deg0; -/* --------------------------------- */ -/* FOR EACH UNMARKED NABOR OF ENODE, */ -/* DO THE FOLLOWING. */ -/* --------------------------------- */ - istrt = xadj[enode]; - istop = xadj[enode + 1] - 1; - i__1 = istop; - for (i = istrt; i <= i__1; ++i) { - nabor = adjncy[i]; - if (nabor == 0) { - goto L2100; - } - if (marker[nabor] >= *tag) { - goto L2000; - } - marker[nabor] = *tag; - link = nabor; -/* ------------------------------ */ -/* IF UNELIMINATED, INCLUDE IT IN */ -/* DEG COUNT. */ -/* ------------------------------ */ - if (dforw[nabor] < 0) { - goto L1700; - } - deg += qsize[nabor]; - goto L2000; -L1700: -/* ------------------------------- -*/ -/* IF ELIMINATED, INCLUDE UNMARKED -*/ -/* NODES IN THIS ELEMENT INTO THE -*/ -/* DEGREE COUNT. */ -/* ------------------------------- -*/ - jstrt = xadj[link]; - jstop = xadj[link + 1] - 1; - i__2 = jstop; - for (j = jstrt; j <= i__2; ++j) { - node = adjncy[j]; - link = -node; - if (node < 0) { - goto L1700; - } else if (node == 0) { - goto L2000; - } else { - goto L1800; - } - -L1800: - if (marker[node] >= *tag) { - goto L1900; - } - marker[node] = *tag; - deg += qsize[node]; -L1900: - ; - } -L2000: - ; - } -L2100: -/* ------------------------------------------- */ -/* UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */ -/* STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */ -/* ------------------------------------------- */ - deg = deg - qsize[enode] + 1; - fnode = dhead[deg]; - dforw[enode] = fnode; - dbakw[enode] = -deg; - if (fnode > 0) { - dbakw[fnode] = enode; - } - dhead[deg] = enode; - if (deg < *mdeg) { - *mdeg = deg; - } -L2200: -/* ---------------------------------- */ -/* GET NEXT ENODE IN CURRENT ELEMENT. */ -/* ---------------------------------- */ - enode = llist[enode]; - if (iq2 == 1) { - goto L900; - } - goto L1600; -L2300: -/* ----------------------------- */ -/* GET NEXT ELEMENT IN THE LIST. */ -/* ----------------------------- */ - *tag = mtag; - elmnt = llist[elmnt]; - goto L100; - -} /* mmdupd_ */ - -/* *************************************************************** */ -/* *************************************************************** */ -/* ***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ***** */ -/* *************************************************************** */ -/* *************************************************************** */ - -/* AUTHOR - JOSEPH W.H. LIU */ -/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ - -/* PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */ -/* PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */ -/* VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */ -/* MINIMUM DEGREE ORDERING ALGORITHM. */ - -/* INPUT PARAMETERS - */ -/* NEQNS - NUMBER OF EQUATIONS. */ -/* QSIZE - SIZE OF SUPERNODES AT ELIMINATION. */ - -/* UPDATED PARAMETERS - */ -/* INVP - INVERSE PERMUTATION VECTOR. ON INPUT, */ -/* IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */ -/* INTO THE NODE -INVP(NODE); OTHERWISE, */ -/* -INVP(NODE) IS ITS INVERSE LABELLING. */ - -/* OUTPUT PARAMETERS - */ -/* PERM - THE PERMUTATION VECTOR. */ - -/* *************************************************************** */ - -/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, - shortint *qsize) -{ - /* System generated locals */ - int i__1; - - /* Local variables */ - static int node, root, nextf, father, nqsize, num; - - -/* *************************************************************** */ - - -/* *************************************************************** */ - - /* Parameter adjustments */ - --qsize; - --invp; - --perm; - - /* Function Body */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - nqsize = qsize[node]; - if (nqsize <= 0) { - perm[node] = invp[node]; - } - if (nqsize > 0) { - perm[node] = -invp[node]; - } -/* L100: */ - } -/* ------------------------------------------------------ */ -/* FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */ -/* ------------------------------------------------------ */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - if (perm[node] > 0) { - goto L500; - } -/* ----------------------------------------- */ -/* TRACE THE MERGED TREE UNTIL ONE WHICH HAS */ -/* NOT BEEN MERGED, CALL IT ROOT. */ -/* ----------------------------------------- */ - father = node; -L200: - if (perm[father] > 0) { - goto L300; - } - father = -perm[father]; - goto L200; -L300: -/* ----------------------- */ -/* NUMBER NODE AFTER ROOT. */ -/* ----------------------- */ - root = father; - num = perm[root] + 1; - invp[node] = -num; - perm[root] = num; -/* ------------------------ */ -/* SHORTEN THE MERGED TREE. */ -/* ------------------------ */ - father = node; -L400: - nextf = -perm[father]; - if (nextf <= 0) { - goto L500; - } - perm[father] = -root; - father = nextf; - goto L400; -L500: - ; - } -/* ---------------------- */ -/* READY TO COMPUTE PERM. */ -/* ---------------------- */ - i__1 = *neqns; - for (node = 1; node <= i__1; ++node) { - num = -invp[node]; - invp[node] = num; - perm[num] = node; -/* L600: */ - } - return 0; - -} /* mmdnum_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,2583 +0,0 @@ -/* ========================================================================== */ -/* === colamd - a sparse matrix column ordering algorithm =================== */ -/* ========================================================================== */ - -/* - colamd: An approximate minimum degree column ordering algorithm. - - Purpose: - - Colamd computes a permutation Q such that the Cholesky factorization of - (AQ)'(AQ) has less fill-in and requires fewer floating point operations - than A'A. This also provides a good ordering for sparse partial - pivoting methods, P(AQ) = LU, where Q is computed prior to numerical - factorization, and P is computed during numerical factorization via - conventional partial pivoting with row interchanges. Colamd is the - column ordering method used in SuperLU, part of the ScaLAPACK library. - It is also available as user-contributed software for Matlab 5.2, - available from MathWorks, Inc. (http://www.mathworks.com). This - routine can be used in place of COLMMD in Matlab. By default, the \ - and / operators in Matlab perform a column ordering (using COLMMD) - prior to LU factorization using sparse partial pivoting, in the - built-in Matlab LU(A) routine. - - Authors: - - The authors of the code itself are Stefan I. Larimore and Timothy A. - Davis (davis@cise.ufl.edu), University of Florida. The algorithm was - developed in collaboration with John Gilbert, Xerox PARC, and Esmond - Ng, Oak Ridge National Laboratory. - - Date: - - August 3, 1998. Version 1.0. - - Acknowledgements: - - This work was supported by the National Science Foundation, under - grants DMS-9504974 and DMS-9803599. - - Notice: - - Copyright (c) 1998 by the University of Florida. All Rights Reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - User documentation of any code that uses this code must cite the - Authors, the Copyright, and "Used by permission." If this code is - accessible from within Matlab, then typing "help colamd" or "colamd" - (with no arguments) must cite the Authors. Permission to modify the - code and to distribute modified code is granted, provided the above - notices are retained, and a notice that the code was modified is - included with the above copyright notice. You must also retain the - Availability information below, of the original version. - - This software is provided free of charge. - - Availability: - - This file is located at - - http://www.cise.ufl.edu/~davis/colamd/colamd.c - - The colamd.h file is required, located in the same directory. - The colamdmex.c file provides a Matlab interface for colamd. - The symamdmex.c file provides a Matlab interface for symamd, which is - a symmetric ordering based on this code, colamd.c. All codes are - purely ANSI C compliant (they use no Unix-specific routines, include - files, etc.). -*/ - -/* ========================================================================== */ -/* === Description of user-callable routines ================================ */ -/* ========================================================================== */ - -/* - Each user-callable routine (declared as PUBLIC) is briefly described below. - Refer to the comments preceding each routine for more details. - - ---------------------------------------------------------------------------- - colamd_recommended: - ---------------------------------------------------------------------------- - - Usage: - - Alen = colamd_recommended (nnz, n_row, n_col) ; - - Purpose: - - Returns recommended value of Alen for use by colamd. Returns -1 - if any input argument is negative. - - Arguments: - - int nnz ; Number of nonzeros in the matrix A. This must - be the same value as p [n_col] in the call to - colamd - otherwise you will get a wrong value - of the recommended memory to use. - int n_row ; Number of rows in the matrix A. - int n_col ; Number of columns in the matrix A. - - ---------------------------------------------------------------------------- - colamd_set_defaults: - ---------------------------------------------------------------------------- - - Usage: - - colamd_set_defaults (knobs) ; - - Purpose: - - Sets the default parameters. - - Arguments: - - double knobs [COLAMD_KNOBS] ; Output only. - - Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries - are removed prior to ordering. Columns with more than - (knobs [COLAMD_DENSE_COL] * n_row) entries are removed - prior to ordering, and placed last in the output column - ordering. Default values of these two knobs are both 0.5. - Currently, only knobs [0] and knobs [1] are used, but future - versions may use more knobs. If so, they will be properly set - to their defaults by the future version of colamd_set_defaults, - so that the code that calls colamd will not need to change, - assuming that you either use colamd_set_defaults, or pass a - (double *) NULL pointer as the knobs array to colamd. - - ---------------------------------------------------------------------------- - colamd: - ---------------------------------------------------------------------------- - - Usage: - - colamd (n_row, n_col, Alen, A, p, knobs) ; - - Purpose: - - Computes a column ordering (Q) of A such that P(AQ)=LU or - (AQ)'AQ=LL' have less fill-in and require fewer floating point - operations than factorizing the unpermuted matrix A or A'A, - respectively. - - Arguments: - - int n_row ; - - Number of rows in the matrix A. - Restriction: n_row >= 0. - Colamd returns FALSE if n_row is negative. - - int n_col ; - - Number of columns in the matrix A. - Restriction: n_col >= 0. - Colamd returns FALSE if n_col is negative. - - int Alen ; - - Restriction (see note): - Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS - Colamd returns FALSE if these conditions are not met. - - Note: this restriction makes an modest assumption regarding - the size of the two typedef'd structures, below. We do, - however, guarantee that - Alen >= colamd_recommended (nnz, n_row, n_col) - will be sufficient. - - int A [Alen] ; Input argument, stats on output. - - A is an integer array of size Alen. Alen must be at least as - large as the bare minimum value given above, but this is very - low, and can result in excessive run time. For best - performance, we recommend that Alen be greater than or equal to - colamd_recommended (nnz, n_row, n_col), which adds - nnz/5 to the bare minimum value given above. - - On input, the row indices of the entries in column c of the - matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices - in a given column c need not be in ascending order, and - duplicate row indices may be be present. However, colamd will - work a little faster if both of these conditions are met - (Colamd puts the matrix into this format, if it finds that the - the conditions are not met). - - The matrix is 0-based. That is, rows are in the range 0 to - n_row-1, and columns are in the range 0 to n_col-1. Colamd - returns FALSE if any row index is out of range. - - The contents of A are modified during ordering, and are thus - undefined on output with the exception of a few statistics - about the ordering (A [0..COLAMD_STATS-1]): - A [0]: number of dense or empty rows ignored. - A [1]: number of dense or empty columns ignored (and ordered - last in the output permutation p) - A [2]: number of garbage collections performed. - A [3]: 0, if all row indices in each column were in sorted - order, and no duplicates were present. - 1, otherwise (in which case colamd had to do more work) - Note that a row can become "empty" if it contains only - "dense" and/or "empty" columns, and similarly a column can - become "empty" if it only contains "dense" and/or "empty" rows. - Future versions may return more statistics in A, but the usage - of these 4 entries in A will remain unchanged. - - int p [n_col+1] ; Both input and output argument. - - p is an integer array of size n_col+1. On input, it holds the - "pointers" for the column form of the matrix A. Column c of - the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first - entry, p [0], must be zero, and p [c] <= p [c+1] must hold - for all c in the range 0 to n_col-1. The value p [n_col] is - thus the total number of entries in the pattern of the matrix A. - Colamd returns FALSE if these conditions are not met. - - On output, if colamd returns TRUE, the array p holds the column - permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is - the first column index in the new ordering, and p [n_col-1] is - the last. That is, p [k] = j means that column j of A is the - kth pivot column, in AQ, where k is in the range 0 to n_col-1 - (p [0] = j means that column j of A is the first column in AQ). - - If colamd returns FALSE, then no permutation is returned, and - p is undefined on output. - - double knobs [COLAMD_KNOBS] ; Input only. - - See colamd_set_defaults for a description. If the knobs array - is not present (that is, if a (double *) NULL pointer is passed - in its place), then the default values of the parameters are - used instead. - -*/ - - -/* ========================================================================== */ -/* === Include files ======================================================== */ -/* ========================================================================== */ - -/* limits.h: the largest positive integer (INT_MAX) */ -#include - -/* colamd.h: knob array size, stats output size, and global prototypes */ -#include "colamd.h" - -/* ========================================================================== */ -/* === Scaffolding code definitions ======================================== */ -/* ========================================================================== */ - -/* Ensure that debugging is turned off: */ -#ifndef NDEBUG -#define NDEBUG -#endif - -/* assert.h: the assert macro (no debugging if NDEBUG is defined) */ -#include - -/* - Our "scaffolding code" philosophy: In our opinion, well-written library - code should keep its "debugging" code, and just normally have it turned off - by the compiler so as not to interfere with performance. This serves - several purposes: - - (1) assertions act as comments to the reader, telling you what the code - expects at that point. All assertions will always be true (unless - there really is a bug, of course). - - (2) leaving in the scaffolding code assists anyone who would like to modify - the code, or understand the algorithm (by reading the debugging output, - one can get a glimpse into what the code is doing). - - (3) (gasp!) for actually finding bugs. This code has been heavily tested - and "should" be fully functional and bug-free ... but you never know... - - To enable debugging, comment out the "#define NDEBUG" above. The code will - become outrageously slow when debugging is enabled. To control the level of - debugging output, set an environment variable D to 0 (little), 1 (some), - 2, 3, or 4 (lots). -*/ - -/* ========================================================================== */ -/* === Row and Column structures ============================================ */ -/* ========================================================================== */ - -typedef struct ColInfo_struct -{ - int start ; /* index for A of first row in this column, or DEAD */ - /* if column is dead */ - int length ; /* number of rows in this column */ - union - { - int thickness ; /* number of original columns represented by this */ - /* col, if the column is alive */ - int parent ; /* parent in parent tree super-column structure, if */ - /* the column is dead */ - } shared1 ; - union - { - int score ; /* the score used to maintain heap, if col is alive */ - int order ; /* pivot ordering of this column, if col is dead */ - } shared2 ; - union - { - int headhash ; /* head of a hash bucket, if col is at the head of */ - /* a degree list */ - int hash ; /* hash value, if col is not in a degree list */ - int prev ; /* previous column in degree list, if col is in a */ - /* degree list (but not at the head of a degree list) */ - } shared3 ; - union - { - int degree_next ; /* next column, if col is in a degree list */ - int hash_next ; /* next column, if col is in a hash list */ - } shared4 ; - -} ColInfo ; - -typedef struct RowInfo_struct -{ - int start ; /* index for A of first col in this row */ - int length ; /* number of principal columns in this row */ - union - { - int degree ; /* number of principal & non-principal columns in row */ - int p ; /* used as a row pointer in init_rows_cols () */ - } shared1 ; - union - { - int mark ; /* for computing set differences and marking dead rows*/ - int first_column ;/* first column in row (used in garbage collection) */ - } shared2 ; - -} RowInfo ; - -/* ========================================================================== */ -/* === Definitions ========================================================== */ -/* ========================================================================== */ - -#define MAX(a,b) (((a) > (b)) ? (a) : (b)) -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) - -#define ONES_COMPLEMENT(r) (-(r)-1) - -#define TRUE (1) -#define FALSE (0) -#define EMPTY (-1) - -/* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) - -/* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } - -/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */ -#define PUBLIC -#define PRIVATE static - -/* ========================================================================== */ -/* === Prototypes of PRIVATE routines ======================================= */ -/* ========================================================================== */ - -PRIVATE int init_rows_cols -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int p [] -) ; - -PRIVATE void init_scoring -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int head [], - double knobs [COLAMD_KNOBS], - int *p_n_row2, - int *p_n_col2, - int *p_max_deg -) ; - -PRIVATE int find_ordering -( - int n_row, - int n_col, - int Alen, - RowInfo Row [], - ColInfo Col [], - int A [], - int head [], - int n_col2, - int max_deg, - int pfree -) ; - -PRIVATE void order_children -( - int n_col, - ColInfo Col [], - int p [] -) ; - -PRIVATE void detect_super_cols -( -#ifndef NDEBUG - int n_col, - RowInfo Row [], -#endif - ColInfo Col [], - int A [], - int head [], - int row_start, - int row_length -) ; - -PRIVATE int garbage_collection -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int *pfree -) ; - -PRIVATE int clear_mark -( - int n_row, - RowInfo Row [] -) ; - -/* ========================================================================== */ -/* === Debugging definitions ================================================ */ -/* ========================================================================== */ - -#ifndef NDEBUG - -/* === With debugging ======================================================= */ - -/* stdlib.h: for getenv and atoi, to get debugging level from environment */ -#include - -/* stdio.h: for printf (no printing if debugging is turned off) */ -#include - -PRIVATE void debug_deg_lists -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int head [], - int min_score, - int should, - int max_deg -) ; - -PRIVATE void debug_mark -( - int n_row, - RowInfo Row [], - int tag_mark, - int max_mark -) ; - -PRIVATE void debug_matrix -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [] -) ; - -PRIVATE void debug_structures -( - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int n_col2 -) ; - -/* the following is the *ONLY* global variable in this file, and is only */ -/* present when debugging */ - -PRIVATE int debug_colamd ; /* debug print level */ - -#define DEBUG0(params) { (void) printf params ; } -#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; } -#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; } -#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; } -#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; } - -#else - -/* === No debugging ========================================================= */ - -#define DEBUG0(params) ; -#define DEBUG1(params) ; -#define DEBUG2(params) ; -#define DEBUG3(params) ; -#define DEBUG4(params) ; - -#endif - -/* ========================================================================== */ - - -/* ========================================================================== */ -/* === USER-CALLABLE ROUTINES: ============================================== */ -/* ========================================================================== */ - - -/* ========================================================================== */ -/* === colamd_recommended =================================================== */ -/* ========================================================================== */ - -/* - The colamd_recommended routine returns the suggested size for Alen. This - value has been determined to provide good balance between the number of - garbage collections and the memory requirements for colamd. -*/ - -PUBLIC int colamd_recommended /* returns recommended value of Alen. */ -( - /* === Parameters ======================================================= */ - - int nnz, /* number of nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) -{ - /* === Local variables ================================================== */ - - int minimum ; /* bare minimum requirements */ - int recommended ; /* recommended value of Alen */ - - if (nnz < 0 || n_row < 0 || n_col < 0) - { - /* return -1 if any input argument is corrupted */ - DEBUG0 (("colamd_recommended error!")) ; - DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ; - return (-1) ; - } - - minimum = - 2 * (nnz) /* for A */ - + (((n_col) + 1) * sizeof (ColInfo) / sizeof (int)) /* for Col */ - + (((n_row) + 1) * sizeof (RowInfo) / sizeof (int)) /* for Row */ - + n_col /* minimum elbow room to guarrantee success */ - + COLAMD_STATS ; /* for output statistics */ - - /* recommended is equal to the minumum plus enough memory to keep the */ - /* number garbage collections low */ - recommended = minimum + nnz/5 ; - - return (recommended) ; -} - - -/* ========================================================================== */ -/* === colamd_set_defaults ================================================== */ -/* ========================================================================== */ - -/* - The colamd_set_defaults routine sets the default values of the user- - controllable parameters for colamd: - - knobs [0] rows with knobs[0]*n_col entries or more are removed - prior to ordering. - - knobs [1] columns with knobs[1]*n_row entries or more are removed - prior to ordering, and placed last in the column - permutation. - - knobs [2..19] unused, but future versions might use this -*/ - -PUBLIC void colamd_set_defaults -( - /* === Parameters ======================================================= */ - - double knobs [COLAMD_KNOBS] /* knob array */ -) -{ - /* === Local variables ================================================== */ - - int i ; - - if (!knobs) - { - return ; /* no knobs to initialize */ - } - for (i = 0 ; i < COLAMD_KNOBS ; i++) - { - knobs [i] = 0 ; - } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ -} - - -/* ========================================================================== */ -/* === colamd =============================================================== */ -/* ========================================================================== */ - -/* - The colamd routine computes a column ordering Q of a sparse matrix - A such that the LU factorization P(AQ) = LU remains sparse, where P is - selected via partial pivoting. The routine can also be viewed as - providing a permutation Q such that the Cholesky factorization - (AQ)'(AQ) = LL' remains sparse. - - On input, the nonzero patterns of the columns of A are stored in the - array A, in order 0 to n_col-1. A is held in 0-based form (rows in the - range 0 to n_row-1 and columns in the range 0 to n_col-1). Row indices - for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0, - and thus p [n_col] is the number of entries in A. The matrix is - destroyed on output. The row indices within each column do not have to - be sorted (from small to large row indices), and duplicate row indices - may be present. However, colamd will work a little faster if columns are - sorted and no duplicates are present. Matlab 5.2 always passes the matrix - with sorted columns, and no duplicates. - - The integer array A is of size Alen. Alen must be at least of size - (where nnz is the number of entries in A): - - nnz for the input column form of A - + nnz for a row form of A that colamd generates - + 6*(n_col+1) for a ColInfo Col [0..n_col] array - (this assumes sizeof (ColInfo) is 6 int's). - + 4*(n_row+1) for a RowInfo Row [0..n_row] array - (this assumes sizeof (RowInfo) is 4 int's). - + elbow_room must be at least n_col. We recommend at least - nnz/5 in addition to that. If sufficient, - changes in the elbow room affect the ordering - time only, not the ordering itself. - + COLAMD_STATS for the output statistics - - Colamd returns FALSE is memory is insufficient, or TRUE otherwise. - - On input, the caller must specify: - - n_row the number of rows of A - n_col the number of columns of A - Alen the size of the array A - A [0 ... nnz-1] the row indices, where nnz = p [n_col] - A [nnz ... Alen-1] (need not be initialized by the user) - p [0 ... n_col] the column pointers, p [0] = 0, and p [n_col] - is the number of entries in A. Column c of A - is stored in A [p [c] ... p [c+1]-1]. - knobs [0 ... 19] a set of parameters that control the behavior - of colamd. If knobs is a NULL pointer the - defaults are used. The user-callable - colamd_set_defaults routine sets the default - parameters. See that routine for a description - of the user-controllable parameters. - - If the return value of Colamd is TRUE, then on output: - - p [0 ... n_col-1] the column permutation. p [0] is the first - column index, and p [n_col-1] is the last. - That is, p [k] = j means that column j of A - is the kth column of AQ. - - A is undefined on output (the matrix pattern is - destroyed), except for the following statistics: - - A [0] the number of dense (or empty) rows ignored - A [1] the number of dense (or empty) columms. These - are ordered last, in their natural order. - A [2] the number of garbage collections performed. - If this is excessive, then you would have - gotten your results faster if Alen was larger. - A [3] 0, if all row indices in each column were in - sorted order and no duplicates were present. - 1, if there were unsorted or duplicate row - indices in the input. You would have gotten - your results faster if A [3] was returned as 0. - - If the return value of Colamd is FALSE, then A and p are undefined on - output. -*/ - -PUBLIC int colamd /* returns TRUE if successful */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* length of A */ - int A [], /* row indices of A */ - int p [], /* pointers to columns in A */ - double knobs [COLAMD_KNOBS] /* parameters (uses defaults if NULL) */ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop index */ - int nnz ; /* nonzeros in A */ - int Row_size ; /* size of Row [], in integers */ - int Col_size ; /* size of Col [], in integers */ - int elbow_room ; /* remaining free space */ - RowInfo *Row ; /* pointer into A of Row [0..n_row] array */ - ColInfo *Col ; /* pointer into A of Col [0..n_col] array */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int ngarbage ; /* number of garbage collections performed */ - int max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs knobs array */ - int init_result ; /* return code from initialization */ - -#ifndef NDEBUG - debug_colamd = 0 ; /* no debug printing */ - /* get "D" environment variable, which gives the debug printing level */ - if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ; - DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ; -#endif - - /* === Check the input arguments ======================================== */ - - if (n_row < 0 || n_col < 0 || !A || !p) - { - /* n_row and n_col must be non-negative, A and p must be present */ - DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ; - return (FALSE) ; - } - nnz = p [n_col] ; - if (nnz < 0 || p [0] != 0) - { - /* nnz must be non-negative, and p [0] must be zero */ - DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ; - return (FALSE) ; - } - - /* === If no knobs, set default parameters ============================== */ - - if (!knobs) - { - knobs = default_knobs ; - colamd_set_defaults (knobs) ; - } - - /* === Allocate the Row and Col arrays from array A ===================== */ - - Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ; - Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ; - elbow_room = Alen - (2*nnz + Col_size + Row_size) ; - if (elbow_room < n_col + COLAMD_STATS) - { - /* not enough space in array A to perform the ordering */ - DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ; - return (FALSE) ; - } - Alen = 2*nnz + elbow_room ; - Col = (ColInfo *) &A [Alen] ; - Row = (RowInfo *) &A [Alen + Col_size] ; - - /* === Construct the row and column data structures ===================== */ - - init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ; - if (init_result == -1) - { - /* input matrix is invalid */ - DEBUG0 (("colamd error! matrix invalid\n")) ; - return (FALSE) ; - } - - /* === Initialize scores, kill dense rows/columns ======================= */ - - init_scoring (n_row, n_col, Row, Col, A, p, knobs, - &n_row2, &n_col2, &max_deg) ; - - /* === Order the supercolumns =========================================== */ - - ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p, - n_col2, max_deg, 2*nnz) ; - - /* === Order the non-principal columns ================================== */ - - order_children (n_col, Col, p) ; - - /* === Return statistics in A =========================================== */ - - for (i = 0 ; i < COLAMD_STATS ; i++) - { - A [i] = 0 ; - } - A [COLAMD_DENSE_ROW] = n_row - n_row2 ; - A [COLAMD_DENSE_COL] = n_col - n_col2 ; - A [COLAMD_DEFRAG_COUNT] = ngarbage ; - A [COLAMD_JUMBLED_COLS] = init_result ; - - return (TRUE) ; -} - - -/* ========================================================================== */ -/* === NON-USER-CALLABLE ROUTINES: ========================================== */ -/* ========================================================================== */ - -/* There are no user-callable routines beyond this point in the file */ - - -/* ========================================================================== */ -/* === init_rows_cols ======================================================= */ -/* ========================================================================== */ - -/* - Takes the column form of the matrix in A and creates the row form of the - matrix. Also, row and column attributes are stored in the Col and Row - structs. If the columns are un-sorted or contain duplicate row indices, - this routine will also sort and remove duplicate row indices from the - column form of the matrix. Returns -1 on error, 1 if columns jumbled, - or 0 if columns not jumbled. Not user-callable. -*/ - -PRIVATE int init_rows_cols /* returns status code */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* row indices of A, of size Alen */ - int p [] /* pointers to columns in A, of size n_col+1 */ -) -{ - /* === Local variables ================================================== */ - - int col ; /* a column index */ - int row ; /* a row index */ - int *cp ; /* a column pointer */ - int *cp_end ; /* a pointer to the end of a column */ - int *rp ; /* a row pointer */ - int *rp_end ; /* a pointer to the end of a row */ - int last_start ; /* start index of previous column in A */ - int start ; /* start index of column in A */ - int last_row ; /* previous row */ - int jumbled_columns ; /* indicates if columns are jumbled */ - - /* === Initialize columns, and check column pointers ==================== */ - - last_start = 0 ; - for (col = 0 ; col < n_col ; col++) - { - start = p [col] ; - if (start < last_start) - { - /* column pointers must be non-decreasing */ - DEBUG0 (("colamd error! last p %d p [col] %d\n",last_start,start)); - return (-1) ; - } - Col [col].start = start ; - Col [col].length = p [col+1] - start ; - Col [col].shared1.thickness = 1 ; - Col [col].shared2.score = 0 ; - Col [col].shared3.prev = EMPTY ; - Col [col].shared4.degree_next = EMPTY ; - last_start = start ; - } - /* must check the end pointer for last column */ - if (p [n_col] < last_start) - { - /* column pointers must be non-decreasing */ - DEBUG0 (("colamd error! last p %d p [n_col] %d\n",p[col],last_start)) ; - return (-1) ; - } - - /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ - - /* === Scan columns, compute row degrees, and check row indices ========= */ - - jumbled_columns = FALSE ; - - for (row = 0 ; row < n_row ; row++) - { - Row [row].length = 0 ; - Row [row].shared2.mark = -1 ; - } - - for (col = 0 ; col < n_col ; col++) - { - last_row = -1 ; - - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - - while (cp < cp_end) - { - row = *cp++ ; - - /* make sure row indices within range */ - if (row < 0 || row >= n_row) - { - DEBUG0 (("colamd error! col %d row %d last_row %d\n", - col, row, last_row)) ; - return (-1) ; - } - else if (row <= last_row) - { - /* row indices are not sorted or repeated, thus cols */ - /* are jumbled */ - jumbled_columns = TRUE ; - } - /* prevent repeated row from being counted */ - if (Row [row].shared2.mark != col) - { - Row [row].length++ ; - Row [row].shared2.mark = col ; - last_row = row ; - } - else - { - /* this is a repeated entry in the column, */ - /* it will be removed */ - Col [col].length-- ; - } - } - } - - /* === Compute row pointers ============================================= */ - - /* row form of the matrix starts directly after the column */ - /* form of matrix in A */ - Row [0].start = p [n_col] ; - Row [0].shared1.p = Row [0].start ; - Row [0].shared2.mark = -1 ; - for (row = 1 ; row < n_row ; row++) - { - Row [row].start = Row [row-1].start + Row [row-1].length ; - Row [row].shared1.p = Row [row].start ; - Row [row].shared2.mark = -1 ; - } - - /* === Create row form ================================================== */ - - if (jumbled_columns) - { - /* if cols jumbled, watch for repeated row indices */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - row = *cp++ ; - if (Row [row].shared2.mark != col) - { - A [(Row [row].shared1.p)++] = col ; - Row [row].shared2.mark = col ; - } - } - } - } - else - { - /* if cols not jumbled, we don't need the mark (this is faster) */ - for (col = 0 ; col < n_col ; col++) - { - cp = &A [p [col]] ; - cp_end = &A [p [col+1]] ; - while (cp < cp_end) - { - A [(Row [*cp++].shared1.p)++] = col ; - } - } - } - - /* === Clear the row marks and set row degrees ========================== */ - - for (row = 0 ; row < n_row ; row++) - { - Row [row].shared2.mark = 0 ; - Row [row].shared1.degree = Row [row].length ; - } - - /* === See if we need to re-create columns ============================== */ - - if (jumbled_columns) - { - -#ifndef NDEBUG - /* make sure column lengths are correct */ - for (col = 0 ; col < n_col ; col++) - { - p [col] = Col [col].length ; - } - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - p [*rp++]-- ; - } - } - for (col = 0 ; col < n_col ; col++) - { - assert (p [col] == 0) ; - } - /* now p is all zero (different than when debugging is turned off) */ -#endif - - /* === Compute col pointers ========================================= */ - - /* col form of the matrix starts at A [0]. */ - /* Note, we may have a gap between the col form and the row */ - /* form if there were duplicate entries, if so, it will be */ - /* removed upon the first garbage collection */ - Col [0].start = 0 ; - p [0] = Col [0].start ; - for (col = 1 ; col < n_col ; col++) - { - /* note that the lengths here are for pruned columns, i.e. */ - /* no duplicate row indices will exist for these columns */ - Col [col].start = Col [col-1].start + Col [col-1].length ; - p [col] = Col [col].start ; - } - - /* === Re-create col form =========================================== */ - - for (row = 0 ; row < n_row ; row++) - { - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - A [(p [*rp++])++] = row ; - } - } - return (1) ; - } - else - { - /* no columns jumbled (this is faster) */ - return (0) ; - } -} - - -/* ========================================================================== */ -/* === init_scoring ========================================================= */ -/* ========================================================================== */ - -/* - Kills dense or empty columns and rows, calculates an initial score for - each column, and places all columns in the degree lists. Not user-callable. -*/ - -PRIVATE void init_scoring -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ - int *p_n_row2, /* number of non-dense, non-empty rows */ - int *p_n_col2, /* number of non-dense, non-empty columns */ - int *p_max_deg /* maximum row degree */ -) -{ - /* === Local variables ================================================== */ - - int c ; /* a column index */ - int r, row ; /* a row index */ - int *cp ; /* a column pointer */ - int deg ; /* degree (# entries) of a row or column */ - int *cp_end ; /* a pointer to the end of a column */ - int *new_cp ; /* new column pointer */ - int col_length ; /* length of pruned column */ - int score ; /* current column score */ - int n_col2 ; /* number of non-dense, non-empty columns */ - int n_row2 ; /* number of non-dense, non-empty rows */ - int dense_row_count ; /* remove rows with more entries than this */ - int dense_col_count ; /* remove cols with more entries than this */ - int min_score ; /* smallest column score */ - int max_deg ; /* maximum row degree */ - int next_col ; /* Used to add to degree list.*/ -#ifndef NDEBUG - int debug_count ; /* debug only. */ -#endif - - /* === Extract knobs ==================================================== */ - - dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; - dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; - DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ; - max_deg = 0 ; - n_col2 = n_col ; - n_row2 = n_row ; - - /* === Kill empty columns =============================================== */ - - /* Put the empty columns at the end in their natural, so that LU */ - /* factorization can proceed as far as possible. */ - for (c = n_col-1 ; c >= 0 ; c--) - { - deg = Col [c].length ; - if (deg == 0) - { - /* this is a empty column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense columns =============================================== */ - - /* Put the dense columns at the end, in their natural order */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip any dead columns */ - if (COL_IS_DEAD (c)) - { - continue ; - } - deg = Col [c].length ; - if (deg > dense_col_count) - { - /* this is a dense column, kill and order it last */ - Col [c].shared2.order = --n_col2 ; - /* decrement the row degrees */ - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - Row [*cp++].shared1.degree-- ; - } - KILL_PRINCIPAL_COL (c) ; - } - } - DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ; - - /* === Kill dense and empty rows ======================================== */ - - for (r = 0 ; r < n_row ; r++) - { - deg = Row [r].shared1.degree ; - assert (deg >= 0 && deg <= n_col) ; - if (deg > dense_row_count || deg == 0) - { - /* kill a dense or empty row */ - KILL_ROW (r) ; - --n_row2 ; - } - else - { - /* keep track of max degree of remaining rows */ - max_deg = MAX (max_deg, deg) ; - } - } - DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ; - - /* === Compute initial column scores ==================================== */ - - /* At this point the row degrees are accurate. They reflect the number */ - /* of "live" (non-dense) columns in each row. No empty rows exist. */ - /* Some "live" columns may contain only dead rows, however. These are */ - /* pruned in the code below. */ - - /* now find the initial matlab score for each column */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* skip dead column */ - if (COL_IS_DEAD (c)) - { - continue ; - } - score = 0 ; - cp = &A [Col [c].start] ; - new_cp = cp ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - /* skip if dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - /* compact the column */ - *new_cp++ = row ; - /* add row's external degree */ - score += Row [row].shared1.degree - 1 ; - /* guard against integer overflow */ - score = MIN (score, n_col) ; - } - /* determine pruned column length */ - col_length = (int) (new_cp - &A [Col [c].start]) ; - if (col_length == 0) - { - /* a newly-made null column (all rows in this col are "dense" */ - /* and have already been killed) */ - DEBUG0 (("Newly null killed: %d\n", c)) ; - Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; - } - else - { - /* set column length and set score */ - assert (score >= 0) ; - assert (score <= n_col) ; - Col [c].length = col_length ; - Col [c].shared2.score = score ; - } - } - DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ; - - /* At this point, all empty rows and columns are dead. All live columns */ - /* are "clean" (containing no dead rows) and simplicial (no supercolumns */ - /* yet). Rows may contain dead columns, but all live rows contain at */ - /* least one live column. */ - -#ifndef NDEBUG - debug_structures (n_row, n_col, Row, Col, A, n_col2) ; -#endif - - /* === Initialize degree lists ========================================== */ - -#ifndef NDEBUG - debug_count = 0 ; -#endif - - /* clear the hash buckets */ - for (c = 0 ; c <= n_col ; c++) - { - head [c] = EMPTY ; - } - min_score = n_col ; - /* place in reverse order, so low column indices are at the front */ - /* of the lists. This is to encourage natural tie-breaking */ - for (c = n_col-1 ; c >= 0 ; c--) - { - /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) - { - DEBUG4 (("place %d score %d minscore %d ncol %d\n", - c, Col [c].shared2.score, min_score, n_col)) ; - - /* === Add columns score to DList =============================== */ - - score = Col [c].shared2.score ; - - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (score >= 0) ; - assert (score <= n_col) ; - assert (head [score] >= EMPTY) ; - - /* now add this column to dList at proper score location */ - next_col = head [score] ; - Col [c].shared3.prev = EMPTY ; - Col [c].shared4.degree_next = next_col ; - - /* if there already was a column with the same score, set its */ - /* previous pointer to this new column */ - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = c ; - } - head [score] = c ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, score) ; - -#ifndef NDEBUG - debug_count++ ; -#endif - } - } - -#ifndef NDEBUG - DEBUG0 (("Live cols %d out of %d, non-princ: %d\n", - debug_count, n_col, n_col-debug_count)) ; - assert (debug_count == n_col2) ; - debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ; -#endif - - /* === Return number of remaining columns, and max row degree =========== */ - - *p_n_col2 = n_col2 ; - *p_n_row2 = n_row2 ; - *p_max_deg = max_deg ; -} - - -/* ========================================================================== */ -/* === find_ordering ======================================================== */ -/* ========================================================================== */ - -/* - Order the principal columns of the supercolumn form of the matrix - (no supercolumns on input). Uses a minimum approximate column minimum - degree ordering method. Not user-callable. -*/ - -PRIVATE int find_ordering /* return the number of garbage collections */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows of A */ - int n_col, /* number of columns of A */ - int Alen, /* size of A, 2*nnz + elbow_room or larger */ - RowInfo Row [], /* of size n_row+1 */ - ColInfo Col [], /* of size n_col+1 */ - int A [], /* column form and row form of A */ - int head [], /* of size n_col+1 */ - int n_col2, /* Remaining columns to order */ - int max_deg, /* Maximum row degree */ - int pfree /* index of first free slot (2*nnz on entry) */ -) -{ - /* === Local variables ================================================== */ - - int k ; /* current pivot ordering step */ - int pivot_col ; /* current pivot column */ - int *cp ; /* a column pointer */ - int *rp ; /* a row pointer */ - int pivot_row ; /* current pivot row */ - int *new_cp ; /* modified column pointer */ - int *new_rp ; /* modified row pointer */ - int pivot_row_start ; /* pointer to start of pivot row */ - int pivot_row_degree ; /* # of columns in pivot row */ - int pivot_row_length ; /* # of supercolumns in pivot row */ - int pivot_col_score ; /* score of pivot column */ - int needed_memory ; /* free space needed for pivot row */ - int *cp_end ; /* pointer to the end of a column */ - int *rp_end ; /* pointer to the end of a row */ - int row ; /* a row index */ - int col ; /* a column index */ - int max_score ; /* maximum possible score */ - int cur_score ; /* score of current column */ - unsigned int hash ; /* hash value for supernode detection */ - int head_column ; /* head of hash bucket */ - int first_col ; /* first column in hash bucket */ - int tag_mark ; /* marker value for mark array */ - int row_mark ; /* Row [row].shared2.mark */ - int set_difference ; /* set difference size of row with pivot row */ - int min_score ; /* smallest column score */ - int col_thickness ; /* "thickness" (# of columns in a supercol) */ - int max_mark ; /* maximum value of tag_mark */ - int pivot_col_thickness ; /* number of columns represented by pivot col */ - int prev_col ; /* Used by Dlist operations. */ - int next_col ; /* Used by Dlist operations. */ - int ngarbage ; /* number of garbage collections performed */ -#ifndef NDEBUG - int debug_d ; /* debug loop counter */ - int debug_step = 0 ; /* debug loop counter */ -#endif - - /* === Initialization and clear mark ==================================== */ - - max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = clear_mark (n_row, Row) ; - min_score = 0 ; - ngarbage = 0 ; - DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ; - - /* === Order the columns ================================================ */ - - for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */) - { - -#ifndef NDEBUG - if (debug_step % 100 == 0) - { - DEBUG0 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - else - { - DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ; - } - debug_step++ ; - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif - - /* === Select pivot column, and order it ============================ */ - - /* make sure degree list isn't empty */ - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (head [min_score] >= EMPTY) ; - -#ifndef NDEBUG - for (debug_d = 0 ; debug_d < min_score ; debug_d++) - { - assert (head [debug_d] == EMPTY) ; - } -#endif - - /* get pivot column from head of minimum degree list */ - while (head [min_score] == EMPTY && min_score < n_col) - { - min_score++ ; - } - pivot_col = head [min_score] ; - assert (pivot_col >= 0 && pivot_col <= n_col) ; - next_col = Col [pivot_col].shared4.degree_next ; - head [min_score] = next_col ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = EMPTY ; - } - - assert (COL_IS_ALIVE (pivot_col)) ; - DEBUG3 (("Pivot col: %d\n", pivot_col)) ; - - /* remember score for defrag check */ - pivot_col_score = Col [pivot_col].shared2.score ; - - /* the pivot column is the kth column in the pivot order */ - Col [pivot_col].shared2.order = k ; - - /* increment order count by column thickness */ - pivot_col_thickness = Col [pivot_col].shared1.thickness ; - k += pivot_col_thickness ; - assert (pivot_col_thickness > 0) ; - - /* === Garbage_collection, if necessary ============================= */ - - needed_memory = MIN (pivot_col_score, n_col - k) ; - if (pfree + needed_memory >= Alen) - { - pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; - ngarbage++ ; - /* after garbage collection we will have enough */ - assert (pfree + needed_memory < Alen) ; - /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = clear_mark (n_row, Row) ; -#ifndef NDEBUG - debug_matrix (n_row, n_col, Row, Col, A) ; -#endif - } - - /* === Compute pivot row pattern ==================================== */ - - /* get starting location for this new merged row */ - pivot_row_start = pfree ; - - /* initialize new row counts to zero */ - pivot_row_degree = 0 ; - - /* tag pivot column as having been visited so it isn't included */ - /* in merged pivot row */ - Col [pivot_col].shared1.thickness = -pivot_col_thickness ; - - /* pivot row is the union of all rows in the pivot column pattern */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; - /* skip if row is dead */ - if (ROW_IS_DEAD (row)) - { - continue ; - } - rp = &A [Row [row].start] ; - rp_end = rp + Row [row].length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - /* add the column, if alive and untagged */ - col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) - { - /* tag column in pivot row */ - Col [col].shared1.thickness = -col_thickness ; - assert (pfree < Alen) ; - /* place column in pivot row */ - A [pfree++] = col ; - pivot_row_degree += col_thickness ; - } - } - } - - /* clear tag on pivot column */ - Col [pivot_col].shared1.thickness = pivot_col_thickness ; - max_deg = MAX (max_deg, pivot_row_degree) ; - -#ifndef NDEBUG - DEBUG3 (("check2\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif - - /* === Kill all rows used to construct pivot row ==================== */ - - /* also kill pivot row, temporarily */ - cp = &A [Col [pivot_col].start] ; - cp_end = cp + Col [pivot_col].length ; - while (cp < cp_end) - { - /* may be killing an already dead row */ - row = *cp++ ; - DEBUG2 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; - } - - /* === Select a row index to use as the new pivot row =============== */ - - pivot_row_length = pfree - pivot_row_start ; - if (pivot_row_length > 0) - { - /* pick the "pivot" row arbitrarily (first row in col) */ - pivot_row = A [Col [pivot_col].start] ; - DEBUG2 (("Pivotal row is %d\n", pivot_row)) ; - } - else - { - /* there is no pivot row, since it is of zero length */ - pivot_row = EMPTY ; - assert (pivot_row_length == 0) ; - } - assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ; - - /* === Approximate degree computation =============================== */ - - /* Here begins the computation of the approximate degree. The column */ - /* score is the sum of the pivot row "length", plus the size of the */ - /* set differences of each row in the column minus the pattern of the */ - /* pivot row itself. The column ("thickness") itself is also */ - /* excluded from the column score (we thus use an approximate */ - /* external degree). */ - - /* The time taken by the following code (compute set differences, and */ - /* add them up) is proportional to the size of the data structure */ - /* being scanned - that is, the sum of the sizes of each column in */ - /* the pivot row. Thus, the amortized time to compute a column score */ - /* is proportional to the size of that column (where size, in this */ - /* context, is the column "length", or the number of row indices */ - /* in that column). The number of row indices in a column is */ - /* monotonically non-decreasing, from the length of the original */ - /* column on input to colamd. */ - - /* === Compute set differences ====================================== */ - - DEBUG1 (("** Computing set differences phase. **\n")) ; - - /* pivot row is currently dead - it will be revived later. */ - - DEBUG2 (("Pivot row: ")) ; - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - assert (COL_IS_ALIVE (col) && col != pivot_col) ; - DEBUG2 (("Col: %d\n", col)) ; - - /* clear tags used to construct pivot row pattern */ - col_thickness = -Col [col].shared1.thickness ; - assert (col_thickness > 0) ; - Col [col].shared1.thickness = col_thickness ; - - /* === Remove column from degree list =========================== */ - - cur_score = Col [col].shared2.score ; - prev_col = Col [col].shared3.prev ; - next_col = Col [col].shared4.degree_next ; - assert (cur_score >= 0) ; - assert (cur_score <= n_col) ; - assert (cur_score >= EMPTY) ; - if (prev_col == EMPTY) - { - head [cur_score] = next_col ; - } - else - { - Col [prev_col].shared4.degree_next = next_col ; - } - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = prev_col ; - } - - /* === Scan the column ========================================== */ - - cp = &A [Col [col].start] ; - cp_end = cp + Col [col].length ; - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - assert (row != pivot_row) ; - set_difference = row_mark - tag_mark ; - /* check if the row has been seen yet */ - if (set_difference < 0) - { - assert (Row [row].shared1.degree <= max_deg) ; - set_difference = Row [row].shared1.degree ; - } - /* subtract column thickness from this row's set difference */ - set_difference -= col_thickness ; - assert (set_difference >= 0) ; - /* absorb this row if the set difference becomes zero */ - if (set_difference == 0) - { - DEBUG1 (("aggressive absorption. Row: %d\n", row)) ; - KILL_ROW (row) ; - } - else - { - /* save the new mark */ - Row [row].shared2.mark = set_difference + tag_mark ; - } - } - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k-pivot_row_degree, max_deg) ; -#endif - - /* === Add up set differences for each column ======================= */ - - DEBUG1 (("** Adding set differences phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - /* get a column */ - col = *rp++ ; - assert (COL_IS_ALIVE (col) && col != pivot_col) ; - hash = 0 ; - cur_score = 0 ; - cp = &A [Col [col].start] ; - /* compact the column */ - new_cp = cp ; - cp_end = cp + Col [col].length ; - - DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ; - - while (cp < cp_end) - { - /* get a row */ - row = *cp++ ; - assert(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; - /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) - { - continue ; - } - assert (row_mark > tag_mark) ; - /* compact the column */ - *new_cp++ = row ; - /* compute hash function */ - hash += row ; - /* add set difference */ - cur_score += row_mark - tag_mark ; - /* integer overflow... */ - cur_score = MIN (cur_score, n_col) ; - } - - /* recompute the column's length */ - Col [col].length = (int) (new_cp - &A [Col [col].start]) ; - - /* === Further mass elimination ================================= */ - - if (Col [col].length == 0) - { - DEBUG1 (("further mass elimination. Col: %d\n", col)) ; - /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; - pivot_row_degree -= Col [col].shared1.thickness ; - assert (pivot_row_degree >= 0) ; - /* order it */ - Col [col].shared2.order = k ; - /* increment order count by column thickness */ - k += Col [col].shared1.thickness ; - } - else - { - /* === Prepare for supercolumn detection ==================== */ - - DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ; - - /* save score so far */ - Col [col].shared2.score = cur_score ; - - /* add column to hash table, for supercolumn detection */ - hash %= n_col + 1 ; - - DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ; - assert (hash <= n_col) ; - - head_column = head [hash] ; - if (head_column > EMPTY) - { - /* degree list "hash" is non-empty, use prev (shared3) of */ - /* first column in degree list as head of hash bucket */ - first_col = Col [head_column].shared3.headhash ; - Col [head_column].shared3.headhash = col ; - } - else - { - /* degree list "hash" is empty, use head as hash bucket */ - first_col = - (head_column + 2) ; - head [hash] = - (col + 2) ; - } - Col [col].shared4.hash_next = first_col ; - - /* save hash function in Col [col].shared3.hash */ - Col [col].shared3.hash = (int) hash ; - assert (COL_IS_ALIVE (col)) ; - } - } - - /* The approximate external column degree is now computed. */ - - /* === Supercolumn detection ======================================== */ - - DEBUG1 (("** Supercolumn detection phase. **\n")) ; - - detect_super_cols ( -#ifndef NDEBUG - n_col, Row, -#endif - Col, A, head, pivot_row_start, pivot_row_length) ; - - /* === Kill the pivotal column ====================================== */ - - KILL_PRINCIPAL_COL (pivot_col) ; - - /* === Clear mark =================================================== */ - - tag_mark += (max_deg + 1) ; - if (tag_mark >= max_mark) - { - DEBUG1 (("clearing tag_mark\n")) ; - tag_mark = clear_mark (n_row, Row) ; - } -#ifndef NDEBUG - DEBUG3 (("check3\n")) ; - debug_mark (n_row, Row, tag_mark, max_mark) ; -#endif - - /* === Finalize the new pivot row, and column scores ================ */ - - DEBUG1 (("** Finalize scores phase. **\n")) ; - - /* for each column in pivot row */ - rp = &A [pivot_row_start] ; - /* compact the pivot row */ - new_rp = rp ; - rp_end = rp + pivot_row_length ; - while (rp < rp_end) - { - col = *rp++ ; - /* skip dead columns */ - if (COL_IS_DEAD (col)) - { - continue ; - } - *new_rp++ = col ; - /* add new pivot row to column */ - A [Col [col].start + (Col [col].length++)] = pivot_row ; - - /* retrieve score so far and add on pivot row's degree. */ - /* (we wait until here for this in case the pivot */ - /* row's degree was reduced due to mass elimination). */ - cur_score = Col [col].shared2.score + pivot_row_degree ; - - /* calculate the max possible score as the number of */ - /* external columns minus the 'k' value minus the */ - /* columns thickness */ - max_score = n_col - k - Col [col].shared1.thickness ; - - /* make the score the external degree of the union-of-rows */ - cur_score -= Col [col].shared1.thickness ; - - /* make sure score is less or equal than the max score */ - cur_score = MIN (cur_score, max_score) ; - assert (cur_score >= 0) ; - - /* store updated score */ - Col [col].shared2.score = cur_score ; - - /* === Place column back in degree list ========================= */ - - assert (min_score >= 0) ; - assert (min_score <= n_col) ; - assert (cur_score >= 0) ; - assert (cur_score <= n_col) ; - assert (head [cur_score] >= EMPTY) ; - next_col = head [cur_score] ; - Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = EMPTY ; - if (next_col != EMPTY) - { - Col [next_col].shared3.prev = col ; - } - head [cur_score] = col ; - - /* see if this score is less than current min */ - min_score = MIN (min_score, cur_score) ; - - } - -#ifndef NDEBUG - debug_deg_lists (n_row, n_col, Row, Col, head, - min_score, n_col2-k, max_deg) ; -#endif - - /* === Resurrect the new pivot row ================================== */ - - if (pivot_row_degree > 0) - { - /* update pivot row length to reflect any cols that were killed */ - /* during super-col detection and mass elimination */ - Row [pivot_row].start = pivot_row_start ; - Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ; - Row [pivot_row].shared1.degree = pivot_row_degree ; - Row [pivot_row].shared2.mark = 0 ; - /* pivot row is no longer dead */ - } - } - - /* === All principal columns have now been ordered ====================== */ - - return (ngarbage) ; -} - - -/* ========================================================================== */ -/* === order_children ======================================================= */ -/* ========================================================================== */ - -/* - The find_ordering routine has ordered all of the principal columns (the - representatives of the supercolumns). The non-principal columns have not - yet been ordered. This routine orders those columns by walking up the - parent tree (a column is a child of the column which absorbed it). The - final permutation vector is then placed in p [0 ... n_col-1], with p [0] - being the first column, and p [n_col-1] being the last. It doesn't look - like it at first glance, but be assured that this routine takes time linear - in the number of columns. Although not immediately obvious, the time - taken by this routine is O (n_col), that is, linear in the number of - columns. Not user-callable. -*/ - -PRIVATE void order_children -( - /* === Parameters ======================================================= */ - - int n_col, /* number of columns of A */ - ColInfo Col [], /* of size n_col+1 */ - int p [] /* p [0 ... n_col-1] is the column permutation*/ -) -{ - /* === Local variables ================================================== */ - - int i ; /* loop counter for all columns */ - int c ; /* column index */ - int parent ; /* index of column's parent */ - int order ; /* column's order */ - - /* === Order each non-principal column ================================== */ - - for (i = 0 ; i < n_col ; i++) - { - /* find an un-ordered non-principal column */ - assert (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY) - { - parent = i ; - /* once found, find its principal parent */ - do - { - parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; - - /* now, order all un-ordered non-principal columns along path */ - /* to this parent. collapse tree at the same time */ - c = i ; - /* get order of parent */ - order = Col [parent].shared2.order ; - - do - { - assert (Col [c].shared2.order == EMPTY) ; - - /* order this column */ - Col [c].shared2.order = order++ ; - /* collaps tree */ - Col [c].shared1.parent = parent ; - - /* get immediate parent of this column */ - c = Col [c].shared1.parent ; - - /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ - /* above an ordered column */ - } while (Col [c].shared2.order == EMPTY) ; - - /* re-order the super_col parent to largest order for this group */ - Col [parent].shared2.order = order ; - } - } - - /* === Generate the permutation ========================================= */ - - for (c = 0 ; c < n_col ; c++) - { - p [Col [c].shared2.order] = c ; - } -} - - -/* ========================================================================== */ -/* === detect_super_cols ==================================================== */ -/* ========================================================================== */ - -/* - Detects supercolumns by finding matches between columns in the hash buckets. - Check amongst columns in the set A [row_start ... row_start + row_length-1]. - The columns under consideration are currently *not* in the degree lists, - and have already been placed in the hash buckets. - - The hash bucket for columns whose hash function is equal to h is stored - as follows: - - if head [h] is >= 0, then head [h] contains a degree list, so: - - head [h] is the first column in degree bucket h. - Col [head [h]].headhash gives the first column in hash bucket h. - - otherwise, the degree list is empty, and: - - -(head [h] + 2) is the first column in hash bucket h. - - For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous - column" pointer. Col [c].shared3.hash is used instead as the hash number - for that column. The value of Col [c].shared4.hash_next is the next column - in the same hash bucket. - - Assuming no, or "few" hash collisions, the time taken by this routine is - linear in the sum of the sizes (lengths) of each column whose score has - just been computed in the approximate degree computation. - Not user-callable. -*/ - -PRIVATE void detect_super_cols -( - /* === Parameters ======================================================= */ - -#ifndef NDEBUG - /* these two parameters are only needed when debugging is enabled: */ - int n_col, /* number of columns of A */ - RowInfo Row [], /* of size n_row+1 */ -#endif - ColInfo Col [], /* of size n_col+1 */ - int A [], /* row indices of A */ - int head [], /* head of degree lists and hash buckets */ - int row_start, /* pointer to set of columns to check */ - int row_length /* number of columns to check */ -) -{ - /* === Local variables ================================================== */ - - int hash ; /* hash # for a column */ - int *rp ; /* pointer to a row */ - int c ; /* a column index */ - int super_c ; /* column index of the column to absorb into */ - int *cp1 ; /* column pointer for column super_c */ - int *cp2 ; /* column pointer for column c */ - int length ; /* length of column super_c */ - int prev_c ; /* column preceding c in hash bucket */ - int i ; /* loop counter */ - int *rp_end ; /* pointer to the end of the row */ - int col ; /* a column index in the row to check */ - int head_column ; /* first column in hash bucket or degree list */ - int first_col ; /* first column in hash bucket */ - - /* === Consider each column in the row ================================== */ - - rp = &A [row_start] ; - rp_end = rp + row_length ; - while (rp < rp_end) - { - col = *rp++ ; - if (COL_IS_DEAD (col)) - { - continue ; - } - - /* get hash number for this column */ - hash = Col [col].shared3.hash ; - assert (hash <= n_col) ; - - /* === Get the first column in this hash bucket ===================== */ - - head_column = head [hash] ; - if (head_column > EMPTY) - { - first_col = Col [head_column].shared3.headhash ; - } - else - { - first_col = - (head_column + 2) ; - } - - /* === Consider each column in the hash bucket ====================== */ - - for (super_c = first_col ; super_c != EMPTY ; - super_c = Col [super_c].shared4.hash_next) - { - assert (COL_IS_ALIVE (super_c)) ; - assert (Col [super_c].shared3.hash == hash) ; - length = Col [super_c].length ; - - /* prev_c is the column preceding column c in the hash bucket */ - prev_c = super_c ; - - /* === Compare super_c with all columns after it ================ */ - - for (c = Col [super_c].shared4.hash_next ; - c != EMPTY ; c = Col [c].shared4.hash_next) - { - assert (c != super_c) ; - assert (COL_IS_ALIVE (c)) ; - assert (Col [c].shared3.hash == hash) ; - - /* not identical if lengths or scores are different */ - if (Col [c].length != length || - Col [c].shared2.score != Col [super_c].shared2.score) - { - prev_c = c ; - continue ; - } - - /* compare the two columns */ - cp1 = &A [Col [super_c].start] ; - cp2 = &A [Col [c].start] ; - - for (i = 0 ; i < length ; i++) - { - /* the columns are "clean" (no dead rows) */ - assert (ROW_IS_ALIVE (*cp1)) ; - assert (ROW_IS_ALIVE (*cp2)) ; - /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ - if (*cp1++ != *cp2++) - { - break ; - } - } - - /* the two columns are different if the for-loop "broke" */ - if (i != length) - { - prev_c = c ; - continue ; - } - - /* === Got it! two columns are identical =================== */ - - assert (Col [c].shared2.score == Col [super_c].shared2.score) ; - - Col [super_c].shared1.thickness += Col [c].shared1.thickness ; - Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; - /* order c later, in order_children() */ - Col [c].shared2.order = EMPTY ; - /* remove c from hash bucket */ - Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; - } - } - - /* === Empty this hash bucket ======================================= */ - - if (head_column > EMPTY) - { - /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = EMPTY ; - } - else - { - /* corresponding degree list "hash" is empty */ - head [hash] = EMPTY ; - } - } -} - - -/* ========================================================================== */ -/* === garbage_collection =================================================== */ -/* ========================================================================== */ - -/* - Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns - the index of the first free position in A, after garbage collection. The - time taken by this routine is linear is the size of the array A, which is - itself linear in the number of nonzeros in the input matrix. - Not user-callable. -*/ - -PRIVATE int garbage_collection /* returns the new value of pfree */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows */ - int n_col, /* number of columns */ - RowInfo Row [], /* row info */ - ColInfo Col [], /* column info */ - int A [], /* A [0 ... Alen-1] holds the matrix */ - int *pfree /* &A [0] ... pfree is in use */ -) -{ - /* === Local variables ================================================== */ - - int *psrc ; /* source pointer */ - int *pdest ; /* destination pointer */ - int j ; /* counter */ - int r ; /* a row index */ - int c ; /* a column index */ - int length ; /* length of a row or column */ - -#ifndef NDEBUG - int debug_rows ; - DEBUG0 (("Defrag..\n")) ; - for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ; - debug_rows = 0 ; -#endif - - /* === Defragment the columns =========================================== */ - - pdest = &A[0] ; - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - psrc = &A [Col [c].start] ; - - /* move and compact the column */ - assert (pdest <= psrc) ; - Col [c].start = (int) (pdest - &A [0]) ; - length = Col [c].length ; - for (j = 0 ; j < length ; j++) - { - r = *psrc++ ; - if (ROW_IS_ALIVE (r)) - { - *pdest++ = r ; - } - } - Col [c].length = (int) (pdest - &A [Col [c].start]) ; - } - } - - /* === Prepare to defragment the rows =================================== */ - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - if (Row [r].length == 0) - { - /* this row is of zero length. cannot compact it, so kill it */ - DEBUG0 (("Defrag row kill\n")) ; - KILL_ROW (r) ; - } - else - { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - assert (ROW_IS_ALIVE (r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; -#ifndef NDEBUG - debug_rows++ ; -#endif - } - } - } - - /* === Defragment the rows ============================================== */ - - psrc = pdest ; - while (psrc < pfree) - { - /* find a negative number ... the start of a row */ - if (*psrc++ < 0) - { - psrc-- ; - /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; - assert (r >= 0 && r < n_row) ; - /* restore first column index */ - *psrc = Row [r].shared2.first_column ; - assert (ROW_IS_ALIVE (r)) ; - - /* move and compact the row */ - assert (pdest <= psrc) ; - Row [r].start = (int) (pdest - &A [0]) ; - length = Row [r].length ; - for (j = 0 ; j < length ; j++) - { - c = *psrc++ ; - if (COL_IS_ALIVE (c)) - { - *pdest++ = c ; - } - } - Row [r].length = (int) (pdest - &A [Row [r].start]) ; -#ifndef NDEBUG - debug_rows-- ; -#endif - } - } - /* ensure we found all the rows */ - assert (debug_rows == 0) ; - - /* === Return the new value of pfree ==================================== */ - - return ((int) (pdest - &A [0])) ; -} - - -/* ========================================================================== */ -/* === clear_mark =========================================================== */ -/* ========================================================================== */ - -/* - Clears the Row [].shared2.mark array, and returns the new tag_mark. - Return value is the new tag_mark. Not user-callable. -*/ - -PRIVATE int clear_mark /* return the new value for tag_mark */ -( - /* === Parameters ======================================================= */ - - int n_row, /* number of rows in A */ - RowInfo Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ -) -{ - /* === Local variables ================================================== */ - - int r ; - - DEBUG0 (("Clear mark\n")) ; - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - Row [r].shared2.mark = 0 ; - } - } - return (1) ; -} - - -/* ========================================================================== */ -/* === debugging routines =================================================== */ -/* ========================================================================== */ - -/* When debugging is disabled, the remainder of this file is ignored. */ - -#ifndef NDEBUG - - -/* ========================================================================== */ -/* === debug_structures ===================================================== */ -/* ========================================================================== */ - -/* - At this point, all empty rows and columns are dead. All live columns - are "clean" (containing no dead rows) and simplicial (no supercolumns - yet). Rows may contain dead columns, but all live rows contain at - least one live column. -*/ - -PRIVATE void debug_structures -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [], - int n_col2 -) -{ - /* === Local variables ================================================== */ - - int i ; - int c ; - int *cp ; - int *cp_end ; - int len ; - int score ; - int r ; - int *rp ; - int *rp_end ; - int deg ; - - /* === Check A, Row, and Col ============================================ */ - - for (c = 0 ; c < n_col ; c++) - { - if (COL_IS_ALIVE (c)) - { - len = Col [c].length ; - score = Col [c].shared2.score ; - DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ; - assert (len > 0) ; - assert (score >= 0) ; - assert (Col [c].shared1.thickness == 1) ; - cp = &A [Col [c].start] ; - cp_end = cp + len ; - while (cp < cp_end) - { - r = *cp++ ; - assert (ROW_IS_ALIVE (r)) ; - } - } - else - { - i = Col [c].shared2.order ; - assert (i >= n_col2 && i < n_col) ; - } - } - - for (r = 0 ; r < n_row ; r++) - { - if (ROW_IS_ALIVE (r)) - { - i = 0 ; - len = Row [r].length ; - deg = Row [r].shared1.degree ; - assert (len > 0) ; - assert (deg > 0) ; - rp = &A [Row [r].start] ; - rp_end = rp + len ; - while (rp < rp_end) - { - c = *rp++ ; - if (COL_IS_ALIVE (c)) - { - i++ ; - } - } - assert (i > 0) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_deg_lists ====================================================== */ -/* ========================================================================== */ - -/* - Prints the contents of the degree lists. Counts the number of columns - in the degree list and compares it to the total it should have. Also - checks the row degrees. -*/ - -PRIVATE void debug_deg_lists -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int head [], - int min_score, - int should, - int max_deg -) -{ - /* === Local variables ================================================== */ - - int deg ; - int col ; - int have ; - int row ; - - /* === Check the degree lists =========================================== */ - - if (n_col > 10000 && debug_colamd <= 0) - { - return ; - } - have = 0 ; - DEBUG4 (("Degree lists: %d\n", min_score)) ; - for (deg = 0 ; deg <= n_col ; deg++) - { - col = head [deg] ; - if (col == EMPTY) - { - continue ; - } - DEBUG4 (("%d:", deg)) ; - while (col != EMPTY) - { - DEBUG4 ((" %d", col)) ; - have += Col [col].shared1.thickness ; - assert (COL_IS_ALIVE (col)) ; - col = Col [col].shared4.degree_next ; - } - DEBUG4 (("\n")) ; - } - DEBUG4 (("should %d have %d\n", should, have)) ; - assert (should == have) ; - - /* === Check the row degrees ============================================ */ - - if (n_row > 10000 && debug_colamd <= 0) - { - return ; - } - for (row = 0 ; row < n_row ; row++) - { - if (ROW_IS_ALIVE (row)) - { - assert (Row [row].shared1.degree <= max_deg) ; - } - } -} - - -/* ========================================================================== */ -/* === debug_mark =========================================================== */ -/* ========================================================================== */ - -/* - Ensures that the tag_mark is less that the maximum and also ensures that - each entry in the mark array is less than the tag mark. -*/ - -PRIVATE void debug_mark -( - /* === Parameters ======================================================= */ - - int n_row, - RowInfo Row [], - int tag_mark, - int max_mark -) -{ - /* === Local variables ================================================== */ - - int r ; - - /* === Check the Row marks ============================================== */ - - assert (tag_mark > 0 && tag_mark <= max_mark) ; - if (n_row > 10000 && debug_colamd <= 0) - { - return ; - } - for (r = 0 ; r < n_row ; r++) - { - assert (Row [r].shared2.mark < tag_mark) ; - } -} - - -/* ========================================================================== */ -/* === debug_matrix ========================================================= */ -/* ========================================================================== */ - -/* - Prints out the contents of the columns and the rows. -*/ - -PRIVATE void debug_matrix -( - /* === Parameters ======================================================= */ - - int n_row, - int n_col, - RowInfo Row [], - ColInfo Col [], - int A [] -) -{ - /* === Local variables ================================================== */ - - int r ; - int c ; - int *rp ; - int *rp_end ; - int *cp ; - int *cp_end ; - - /* === Dump the rows and columns of the matrix ========================== */ - - if (debug_colamd < 3) - { - return ; - } - DEBUG3 (("DUMP MATRIX:\n")) ; - for (r = 0 ; r < n_row ; r++) - { - DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ; - if (ROW_IS_DEAD (r)) - { - continue ; - } - DEBUG3 (("start %d length %d degree %d\n", - Row [r].start, Row [r].length, Row [r].shared1.degree)) ; - rp = &A [Row [r].start] ; - rp_end = rp + Row [r].length ; - while (rp < rp_end) - { - c = *rp++ ; - DEBUG3 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ; - } - } - - for (c = 0 ; c < n_col ; c++) - { - DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ; - if (COL_IS_DEAD (c)) - { - continue ; - } - DEBUG3 (("start %d length %d shared1 %d shared2 %d\n", - Col [c].start, Col [c].length, - Col [c].shared1.thickness, Col [c].shared2.score)) ; - cp = &A [Col [c].start] ; - cp_end = cp + Col [c].length ; - while (cp < cp_end) - { - r = *cp++ ; - DEBUG3 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ; - } - } -} - -#endif - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -/* ========================================================================== */ -/* === colamd prototypes and definitions ==================================== */ -/* ========================================================================== */ - -/* - This is the colamd include file, - - http://www.cise.ufl.edu/~davis/colamd/colamd.h - - for use in the colamd.c, colamdmex.c, and symamdmex.c files located at - - http://www.cise.ufl.edu/~davis/colamd/ - - See those files for a description of colamd and symamd, and for the - copyright notice, which also applies to this file. - - August 3, 1998. Version 1.0. -*/ - -/* ========================================================================== */ -/* === Definitions ========================================================== */ -/* ========================================================================== */ - -/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 - -/* number of output statistics. Only A [0..2] are currently used. */ -#define COLAMD_STATS 20 - -/* knobs [0] and A [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 - -/* knobs [1] and A [1]: dense column knob and output statistic. */ -#define COLAMD_DENSE_COL 1 - -/* A [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 - -/* A [3]: whether or not the input columns were jumbled or had duplicates */ -#define COLAMD_JUMBLED_COLS 3 - -/* ========================================================================== */ -/* === Prototypes of user-callable routines ================================= */ -/* ========================================================================== */ - -int colamd_recommended /* returns recommended value of Alen */ -( - int nnz, /* nonzeros in A */ - int n_row, /* number of rows in A */ - int n_col /* number of columns in A */ -) ; - -void colamd_set_defaults /* sets default parameters */ -( /* knobs argument is modified on output */ - double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ -) ; - -int colamd /* returns TRUE if successful, FALSE otherwise*/ -( /* A and p arguments are modified on output */ - int n_row, /* number of rows in A */ - int n_col, /* number of columns in A */ - int Alen, /* size of the array A */ - int A [], /* row indices of A, of size Alen */ - int p [], /* column pointers of A, of size n_col+1 */ - double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ -) ; - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/relax_snode.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/relax_snode.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/relax_snode.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/relax_snode.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_ddefs.h" - -void -relax_snode ( - const int n, - int *et, /* column elimination tree */ - const int relax_columns, /* max no of columns allowed in a - relaxed snode */ - int *descendants, /* no of descendants of each node - in the etree */ - int *relax_end /* last column in a supernode */ - ) -{ -/* - * Purpose - * ======= - * relax_snode() - Identify the initial relaxed supernodes, assuming that - * the matrix has been reordered according to the postorder of the etree. - * - */ - register int j, parent; - register int snode_start; /* beginning of a snode */ - - ifill (relax_end, n, EMPTY); - for (j = 0; j < n; j++) descendants[j] = 0; - - /* Compute the number of descendants of each node in the etree */ - for (j = 0; j < n; j++) { - parent = et[j]; - if ( parent != n ) /* not the dummy root */ - descendants[parent] += descendants[j] + 1; - } - - /* Identify the relaxed supernodes by postorder traversal of the etree. */ - for (j = 0; j < n; ) { - parent = et[j]; - snode_start = j; - while ( parent != n && descendants[parent] < relax_columns ) { - j = parent; - parent = et[j]; - } - /* Found a supernode with j being the last column. */ - relax_end[snode_start] = j; /* Last column is recorded */ - j++; - /* Search for a new leaf */ - while ( descendants[j] != 0 && j < n ) j++; - } - - /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", - nsuper, no_relaxed_col); */ -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,348 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_sdefs.h" - -/* - * Function prototypes - */ -void susolve(int, int, float*, float*); -void slsolve(int, int, float*, float*); -void smatvec(int, int, int, float*, float*, float*); - - - -/* Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -scolumn_bmod ( - const int jcol, /* in */ - const int nseg, /* in */ - float *dense, /* in */ - float *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in */ - int fpanelc, /* in -- first column in the current panel */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose: - * ======== - * Performs numeric block updates (sup-col) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - float alpha, beta; - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in supernode - * nsupr = no of rows in supernode (used as leading dimension) - * luptr = location of supernodal LU-block in storage - * kfnz = first nonz in the k-th supernodal segment - * no_zeros = no of leading zeros in a supernodal U-segment - */ - float ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int fsupc, nsupc, nsupr, segsze; - int nrow; /* No of rows in the matrix of matrix-vector */ - int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; - register int lptr, kfnz, isub, irow, i; - register int no_zeros, new_next; - int ufirst, nextlu; - int fst_col; /* First column within small LU update */ - int d_fsupc; /* Distance between the first column of the current - panel and the first column of the current snode. */ - int *xsup, *supno; - int *lsub, *xlsub; - float *lusup; - int *xlusup; - int nzlumax; - float *tempv1; - float zero = 0.0; - float one = 1.0; - float none = -1.0; - int mem_error; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - nzlumax = Glu->nzlumax; - jcolp1 = jcol + 1; - jsupno = supno[jcol]; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - - krep = segrep[k]; - k--; - ksupno = supno[krep]; - if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ - - fsupc = xsup[ksupno]; - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - /* Distance from the current supernode to the current panel; - d_fsupc=0 if fsupc > fpanelc. */ - d_fsupc = fst_col - fsupc; - - luptr = xlusup[fst_col] + d_fsupc; - lptr = xlsub[fsupc] + d_fsupc; - - kfnz = repfnz[krep]; - kfnz = SUPERLU_MAX ( kfnz, fpanelc ); - - segsze = krep - kfnz + 1; - nsupc = krep - fst_col + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nrow = nsupr - d_fsupc - nsupc; - krep_ind = lptr + nsupc - 1; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - - /* - * Case 1: Update U-segment of size 1 -- col-col update - */ - if ( segsze == 1 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - dense[irow] -= ukj*lusup[luptr]; - luptr++; - } - - } else if ( segsze <= 3 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { /* Case 2: 2cols-col update */ - ukj -= ukj1 * lusup[luptr1]; - dense[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - dense[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] ); - } - } else { /* Case 3: 3cols-col update */ - ukj2 = dense[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense[lsub[krep_ind]] = ukj; - dense[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - luptr2++; - dense[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - - - } else { - /* - * Case: sup-col update - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense - */ - - no_zeros = kfnz - fst_col; - - /* Copy U[*,j] segment from dense[*] to tempv[*] */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - tempv[i] = dense[irow]; - ++isub; - } - - /* Dense triangular solve -- start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - strsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - slsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); -#endif - - - /* Scatter tempv[] into SPA dense[] as a temporary storage */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense[irow] = tempv[i]; - tempv[i] = zero; - ++isub; - } - - /* Scatter tempv1[] into SPA dense[] */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - dense[irow] -= tempv1[i]; - tempv1[i] = zero; - ++isub; - } - } - - } /* if jsupno ... */ - - } /* for each segment... */ - - /* - * Process the supernodal portion of L\U[*,j] - */ - nextlu = xlusup[jcol]; - fsupc = xsup[jsupno]; - - /* Copy the SPA dense into L\U[*,j] */ - new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; - while ( new_next > nzlumax ) { - if (mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) - return (mem_error); - lusup = Glu->lusup; - lsub = Glu->lsub; - } - - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = zero; - ++nextlu; - } - - xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ - - /* For more updates within the panel (also within the current supernode), - * should start from the first column of the panel, or the first column - * of the supernode, whichever is bigger. There are 2 cases: - * 1) fsupc < fpanelc, then fst_col := fpanelc - * 2) fsupc >= fpanelc, then fst_col := fsupc - */ - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - if ( fst_col < jcol ) { - - /* Distance between the current supernode and the current panel. - d_fsupc=0 if fsupc >= fpanelc. */ - d_fsupc = fst_col - fsupc; - - lptr = xlsub[fsupc] + d_fsupc; - luptr = xlusup[fst_col] + d_fsupc; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nsupc = jcol - fst_col; /* Excluding jcol */ - nrow = nsupr - d_fsupc - nsupc; - - /* Points to the beginning of jcol in snode L\U(jsupno) */ - ufirst = xlusup[jcol] + d_fsupc; - - ops[TRSV] += nsupc * (nsupc - 1); - ops[GEMV] += 2 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#else - strsv_( "L", "N", "U", &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#endif - - alpha = none; beta = one; /* y := beta*y + alpha*A*x */ - -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - - smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], tempv ); - - /* Copy updates from tempv[*] into lusup[*] */ - isub = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - lusup[isub] -= tempv[i]; - tempv[i] = 0.0; - ++isub; - } - -#endif - - - } /* if fst_col < jcol ... */ - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,269 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -/* What type of supernodes we want */ -#define T2_SUPER - -int -scolumn_dfs( - const int m, /* in - number of rows in the matrix */ - const int jcol, /* in */ - int *perm_r, /* in */ - int *nseg, /* modified - with new segments appended */ - int *lsub_col, /* in - defines the RHS vector to start the dfs */ - int *segrep, /* modified - with new segments appended */ - int *repfnz, /* modified */ - int *xprune, /* modified */ - int *marker, /* modified */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * "column_dfs" performs a symbolic factorization on column jcol, and - * decide the supernode boundary. - * - * This routine does not use numeric values, but only use the RHS - * row indices to start the dfs. - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. The routine returns a list of such supernodal - * representatives in topological order of the dfs that generates them. - * The location of the first nonzero in each such supernodal segment - * (supernodal entry location) is also returned. - * - * Local parameters - * ================ - * nseg: no of segments in current U[*,j] - * jsuper: jsuper=EMPTY if column j does not belong to the same - * supernode as j-1. Otherwise, jsuper=nsuper. - * - * marker2: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - * Return value - * ============ - * 0 success; - * > 0 number of bytes allocated when run out of space. - * - */ - int jcolp1, jcolm1, jsuper, nsuper, nextl; - int k, krep, krow, kmark, kperm; - int *marker2; /* Used for small panel LU */ - int fsupc; /* First column of a snode */ - int myfnz; /* First nonz column of a U-segment */ - int chperm, chmark, chrep, kchild; - int xdfs, maxdfs, kpar, oldrep; - int jptr, jm1ptr; - int ito, ifrom, istop; /* Used to compress row subscripts */ - int mem_error; - int *xsup, *supno, *lsub, *xlsub; - int nzlmax; - static int first = 1, maxsuper; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - if ( first ) { - maxsuper = sp_ienv(3); - first = 0; - } - jcolp1 = jcol + 1; - jcolm1 = jcol - 1; - nsuper = supno[jcol]; - jsuper = nsuper; - nextl = xlsub[jcol]; - marker2 = &marker[2*m]; - - - /* For each nonzero in A[*,jcol] do dfs */ - for (k = 0; lsub_col[k] != EMPTY; k++) { - - krow = lsub_col[k]; - lsub_col[k] = EMPTY; - kmark = marker2[krow]; - - /* krow was visited before, go to the next nonz */ - if ( kmark == jcol ) continue; - - /* For each unmarked nbr krow of jcol - * krow is in L: place it in structure of L[*,jcol] - */ - marker2[krow] = jcol; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - lsub[nextl++] = krow; /* krow is indexed into A */ - if ( nextl >= nzlmax ) { - if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ - } else { - /* krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz[krep]; - - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > kperm ) repfnz[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker2[kchild]; - - if ( chmark != jcol ) { /* Not reached yet */ - marker2[kchild] = jcol; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,k] */ - if ( chperm == EMPTY ) { - lsub[nextl++] = kchild; - if ( nextl >= nzlmax ) { - if ( mem_error = - sLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( chmark != jcolm1 ) jsuper = EMPTY; - } else { - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz[chrep]; - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz[chrep] = chperm; - } else { - /* Continue dfs at super-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L^t) */ - parent[krep] = oldrep; - repfnz[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - } /* else */ - - } /* else */ - - } /* if */ - - } /* while */ - - /* krow has no more unexplored nbrs; - * place supernode-rep krep in postorder DFS. - * backtrack dfs to its parent - */ - segrep[*nseg] = krep; - ++(*nseg); - kpar = parent[krep]; /* Pop from stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - - } while ( kpar != EMPTY ); /* Until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonzero ... */ - - /* Check to see if j belongs in the same supernode as j-1 */ - if ( jcol == 0 ) { /* Do nothing for column 0 */ - nsuper = supno[0] = 0; - } else { - fsupc = xsup[nsuper]; - jptr = xlsub[jcol]; /* Not compressed yet */ - jm1ptr = xlsub[jcolm1]; - -#ifdef T2_SUPER - if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; -#endif - /* Make sure the number of columns in a supernode doesn't - exceed threshold. */ - if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; - - /* If jcol starts a new supernode, reclaim storage space in - * lsub from the previous supernode. Note we only store - * the subscript set of the first and last columns of - * a supernode. (first for num values, last for pruning) - */ - if ( jsuper == EMPTY ) { /* starts a new supernode */ - if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ -#ifdef CHK_COMPRESS - printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); -#endif - ito = xlsub[fsupc+1]; - xlsub[jcolm1] = ito; - istop = ito + jptr - jm1ptr; - xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ - xlsub[jcol] = istop; - for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) - lsub[ito] = lsub[ifrom]; - nextl = ito; /* = istop + length(jcol) */ - } - nsuper++; - supno[jcol] = nsuper; - } /* if a new supernode */ - - } /* else: jcol > 0 */ - - /* Tidy up the pointers before exit */ - xsup[nsuper+1] = jcolp1; - supno[jcolp1] = nsuper; - xprune[jcol] = nextl; /* Initialize upper bound for pruning */ - xlsub[jcolp1] = nextl; - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scomplex.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scomplex.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scomplex.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scomplex.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,106 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * This file defines common arithmetic operations for complex type. - */ -#include -#include -#include -#include "slu_scomplex.h" - - -/* Complex Division c = a/b */ -void c_div(complex *c, complex *a, complex *b) -{ - float ratio, den; - float abr, abi, cr, ci; - - if( (abr = b->r) < 0.) - abr = - abr; - if( (abi = b->i) < 0.) - abi = - abi; - if( abr <= abi ) { - if (abi == 0) { - fprintf(stderr, "z_div.c: division by zero\n"); - exit(-1); - } - ratio = b->r / b->i ; - den = b->i * (1 + ratio*ratio); - cr = (a->r*ratio + a->i) / den; - ci = (a->i*ratio - a->r) / den; - } else { - ratio = b->i / b->r ; - den = b->r * (1 + ratio*ratio); - cr = (a->r + a->i*ratio) / den; - ci = (a->i - a->r*ratio) / den; - } - c->r = cr; - c->i = ci; -} - - -/* Returns sqrt(z.r^2 + z.i^2) */ -double c_abs(complex *z) -{ - float temp; - float real = z->r; - float imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - if (imag > real) { - temp = real; - real = imag; - imag = temp; - } - if ((real+imag) == real) return(real); - - temp = imag/real; - temp = real*sqrt(1.0 + temp*temp); /*overflow!!*/ - return (temp); -} - - -/* Approximates the abs */ -/* Returns abs(z.r) + abs(z.i) */ -double c_abs1(complex *z) -{ - float real = z->r; - float imag = z->i; - - if (real < 0) real = -real; - if (imag < 0) imag = -imag; - - return (real + imag); -} - -/* Return the exponentiation */ -void c_exp(complex *r, complex *z) -{ - float expx; - - expx = exp(z->r); - r->r = expx * cos(z->i); - r->i = expx * sin(z->i); -} - -/* Return the complex conjugate */ -void r_cnjg(complex *r, complex *z) -{ - r->r = z->r; - r->i = -z->i; -} - -/* Return the imaginary part */ -double r_imag(complex *z) -{ - return (z->i); -} - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,103 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -int -scopy_to_ucol( - int jcol, /* in */ - int nseg, /* in */ - int *segrep, /* in */ - int *repfnz, /* in */ - int *perm_r, /* in */ - float *dense, /* modified - reset to zero on return */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Gather from SPA dense[*] to global ucol[*]. - */ - int ksub, krep, ksupno; - int i, k, kfnz, segsze; - int fsupc, isub, irow; - int jsupno, nextu; - int new_next, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - float *ucol; - int *usub, *xusub; - int nzumax; - - float zero = 0.0; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - nzumax = Glu->nzumax; - - jsupno = supno[jcol]; - nextu = xusub[jcol]; - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - krep = segrep[k--]; - ksupno = supno[krep]; - - if ( ksupno != jsupno ) { /* Should go into ucol[] */ - kfnz = repfnz[krep]; - if ( kfnz != EMPTY ) { /* Nonzero U-segment */ - - fsupc = xsup[ksupno]; - isub = xlsub[fsupc] + kfnz - fsupc; - segsze = krep - kfnz + 1; - - new_next = nextu + segsze; - while ( new_next > nzumax ) { - if (mem_error = sLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) - return (mem_error); - ucol = Glu->ucol; - if (mem_error = sLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) - return (mem_error); - usub = Glu->usub; - lsub = Glu->lsub; - } - - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - usub[nextu] = perm_r[irow]; - ucol[nextu] = dense[irow]; - dense[irow] = zero; - nextu++; - isub++; - } - - } - - } - - } /* for each segment... */ - - xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scsum1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scsum1.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scsum1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scsum1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ -#include "slu_Cnames.h" -#include "slu_scomplex.h" - -double scsum1_(int *n, complex *cx, int *incx) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SCSUM1 takes the sum of the absolute values of a complex - vector and returns a single precision result. - - Based on SCASUM from the Level 1 BLAS. - The change is to use the 'genuine' absolute value. - - Contributed by Nick Higham for use with CLACON. - - Arguments - ========= - - N (input) INT - The number of elements in the vector CX. - - CX (input) COMPLEX array, dimension (N) - The vector whose elements will be summed. - - INCX (input) INT - The spacing between successive values of CX. INCX > 0. - - ===================================================================== - - - - - Parameter adjustments - Function Body */ - /* System generated locals */ - int i__1, i__2; - float ret_val; - /* Builtin functions */ - double c_abs(complex *); - /* Local variables */ - static int i, nincx; - static float stemp; - - -#define CX(I) cx[(I)-1] - - - ret_val = 0.f; - stemp = 0.f; - if (*n <= 0) { - return ret_val; - } - if (*incx == 1) { - goto L20; - } - -/* CODE FOR INCREMENT NOT EQUAL TO 1 */ - - nincx = *n * *incx; - i__1 = nincx; - i__2 = *incx; - for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { - -/* NEXT LINE MODIFIED. */ - - stemp += c_abs(&CX(i)); -/* L10: */ - } - ret_val = stemp; - return ret_val; - -/* CODE FOR INCREMENT EQUAL TO 1 */ - -L20: - i__2 = *n; - for (i = 1; i <= *n; ++i) { - -/* NEXT LINE MODIFIED. */ - - stemp += c_abs(&CX(i)); -/* L30: */ - } - ret_val = stemp; - return ret_val; - -/* End of SCSUM1 */ - -} /* scsum1_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgscon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgscon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgscon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,146 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: sgscon.c - * History: Modified from lapack routines SGECON. - */ -#include -#include "slu_sdefs.h" - -void -sgscon(char *norm, SuperMatrix *L, SuperMatrix *U, - float anorm, float *rcond, SuperLUStat_t *stat, int *info) -{ -/* - Purpose - ======= - - SGSCON estimates the reciprocal of the condition number of a general - real matrix A, in either the 1-norm or the infinity-norm, using - the LU factorization computed by SGETRF. - - An estimate is obtained for norm(inv(A)), and the reciprocal of the - condition number is computed as - RCOND = 1 / ( norm(A) * norm(inv(A)) ). - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - NORM (input) char* - Specifies whether the 1-norm condition number or the - infinity-norm condition number is required: - = '1' or 'O': 1-norm; - = 'I': Infinity-norm. - - L (input) SuperMatrix* - The factor L from the factorization Pr*A*Pc=L*U as computed by - sgstrf(). Use compressed row subscripts storage for supernodes, - i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - - U (input) SuperMatrix* - The factor U from the factorization Pr*A*Pc=L*U as computed by - sgstrf(). Use column-wise storage scheme, i.e., U has types: - Stype = SLU_NC, Dtype = SLU_S, Mtype = TRU. - - ANORM (input) float - If NORM = '1' or 'O', the 1-norm of the original matrix A. - If NORM = 'I', the infinity-norm of the original matrix A. - - RCOND (output) float* - The reciprocal of the condition number of the matrix A, - computed as RCOND = 1/(norm(A) * norm(inv(A))). - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - - ===================================================================== -*/ - - /* Local variables */ - int kase, kase1, onenrm, i; - float ainvnm; - float *work; - int *iwork; - extern int srscl_(int *, float *, float *, int *); - - extern int slacon_(int *, float *, float *, int *, float *, int *); - - - /* Test the input parameters. */ - *info = 0; - onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); - if (! onenrm && ! lsame_(norm, "I")) *info = -1; - else if (L->nrow < 0 || L->nrow != L->ncol || - L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU) - *info = -2; - else if (U->nrow < 0 || U->nrow != U->ncol || - U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU) - *info = -3; - if (*info != 0) { - i = -(*info); - xerbla_("sgscon", &i); - return; - } - - /* Quick return if possible */ - *rcond = 0.; - if ( L->nrow == 0 || U->nrow == 0) { - *rcond = 1.; - return; - } - - work = floatCalloc( 3*L->nrow ); - iwork = intMalloc( L->nrow ); - - - if ( !work || !iwork ) - ABORT("Malloc fails for work arrays in sgscon."); - - /* Estimate the norm of inv(A). */ - ainvnm = 0.; - if ( onenrm ) kase1 = 1; - else kase1 = 2; - kase = 0; - - do { - slacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase); - - if (kase == 0) break; - - if (kase == kase1) { - /* Multiply by inv(L). */ - sp_strsv("L", "No trans", "Unit", L, U, &work[0], stat, info); - - /* Multiply by inv(U). */ - sp_strsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); - - } else { - - /* Multiply by inv(U'). */ - sp_strsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); - - /* Multiply by inv(L'). */ - sp_strsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); - - } - - } while ( kase != 0 ); - - /* Compute the estimate of the reciprocal condition number. */ - if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; - - SUPERLU_FREE (work); - SUPERLU_FREE (iwork); - return; - -} /* sgscon */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsequ.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,184 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: sgsequ.c - * History: Modified from LAPACK routine SGEEQU - */ -#include -#include "slu_sdefs.h" - -void -sgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd, - float *colcnd, float *amax, int *info) -{ -/* - Purpose - ======= - - SGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE. - - R (output) float*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) float*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) float* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) float* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) float* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= A->nrow: the i-th row of A is exactly zero - > A->ncol: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - float *Aval; - int i, j, irow; - float rcmin, rcmax; - float bignum, smlnum; - extern double slamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("sgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Get machine constants. */ - smlnum = slamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* sgsequ */ - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsrfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,437 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: sgsrfs.c - * History: Modified from lapack routine SGERFS - */ -#include -#include "slu_sdefs.h" - -void -sgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, char *equed, float *R, float *C, - SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * SGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates for - * the solution. - * - * If equilibration was performed, the system becomes: - * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * A (input) SuperMatrix* - * The original matrix A in the system, or the scaled A if - * equilibration was done. The type of A can be: - * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_GE. - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * sgstrf(). Use column-wise storage scheme, - * i.e., U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (A->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * equed (input) Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by - * diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * - * R (input) float*, dimension (A->nrow) - * The row scale factors for A. - * If equed = 'R' or 'B', A is premultiplied by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * - * C (input) float*, dimension (A->ncol) - * The column scale factors for A. - * If equed = 'C' or 'B', A is postmultiplied by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * - * B (input) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * The right hand side matrix B. - * if equed = 'R' or 'B', B is premultiplied by diag(R). - * - * X (input/output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * On entry, the solution matrix X, as computed by sgstrs(). - * On exit, the improved solution matrix X. - * if *equed = 'C' or 'B', X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * FERR (output) float*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * - * BERR (output) float*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if INFO = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 5 - - /* Table of constant values */ - int ione = 1; - float ndone = -1.; - float done = 1.; - - /* Local variables */ - NCformat *Astore; - float *Aval; - SuperMatrix Bjcol; - DNformat *Bstore, *Xstore, *Bjcol_store; - float *Bmat, *Xmat, *Bptr, *Xptr; - int kase; - float safe1, safe2; - int i, j, k, irow, nz, count, notran, rowequ, colequ; - int ldb, ldx, nrhs; - float s, xk, lstres, eps, safmin; - char transc[1]; - trans_t transt; - float *work; - float *rwork; - int *iwork; - extern double slamch_(char *); - extern int slacon_(int *, float *, float *, int *, float *, int *); -#ifdef _CRAY - extern int SCOPY(int *, float *, int *, float *, int *); - extern int SSAXPY(int *, float *, float *, int *, float *, int *); -#else - extern int scopy_(int *, float *, int *, float *, int *); - extern int saxpy_(int *, float *, float *, int *, float *, int *); -#endif - - Astore = A->Store; - Aval = Astore->nzval; - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - /* Test the input parameters */ - *info = 0; - notran = (trans == NOTRANS); - if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) - *info = -2; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU ) - *info = -3; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU ) - *info = -4; - else if ( ldb < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) - *info = -10; - else if ( ldx < SUPERLU_MAX(0, A->nrow) || - X->Stype != SLU_DN || X->Dtype != SLU_S || X->Mtype != SLU_GE ) - *info = -11; - if (*info != 0) { - i = -(*info); - xerbla_("sgsrfs", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || nrhs == 0) { - for (j = 0; j < nrhs; ++j) { - ferr[j] = 0.; - berr[j] = 0.; - } - return; - } - - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - - /* Allocate working space */ - work = floatMalloc(2*A->nrow); - rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) ); - iwork = intMalloc(2*A->nrow); - if ( !work || !rwork || !iwork ) - ABORT("Malloc fails for work/rwork/iwork."); - - if ( notran ) { - *(unsigned char *)transc = 'N'; - transt = TRANS; - } else { - *(unsigned char *)transc = 'T'; - transt = NOTRANS; - } - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = slamch_("Epsilon"); - safmin = slamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - - /* Compute the number of nonzeros in each row (or column) of A */ - for (i = 0; i < A->nrow; ++i) iwork[i] = 0; - if ( notran ) { - for (k = 0; k < A->ncol; ++k) - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - ++iwork[Astore->rowind[i]]; - } else { - for (k = 0; k < A->ncol; ++k) - iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; - } - - /* Copy one column of RHS B into Bjcol. */ - Bjcol.Stype = B->Stype; - Bjcol.Dtype = B->Dtype; - Bjcol.Mtype = B->Mtype; - Bjcol.nrow = B->nrow; - Bjcol.ncol = 1; - Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); - Bjcol_store = Bjcol.Store; - Bjcol_store->lda = ldb; - Bjcol_store->nzval = work; /* address aliasing */ - - /* Do for each right hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - Bptr = &Bmat[j*ldb]; - Xptr = &Xmat[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - -#ifdef _CRAY - SCOPY(&A->nrow, Bptr, &ione, work, &ione); -#else - scopy_(&A->nrow, Bptr, &ione, work, &ione); -#endif - sp_sgemv(transc, ndone, A, Xptr, ione, done, work, ione); - - /* Compute componentwise relative backward error from formula - max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) - where abs(Z) is the componentwise absolute value of the matrix - or vector Z. If the i-th component of the denominator is less - than SAFE2, then SAFE1 is added to the i-th component of the - numerator and denominator before dividing. */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if (notran) { - for (k = 0; k < A->ncol; ++k) { - xk = fabs( Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - s += fabs(Aval[i]) * fabs(Xptr[irow]); - } - rwork[k] += s; - } - } - s = 0.; - for (i = 0; i < A->nrow; ++i) { - if (rwork[i] > safe2) - s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); - else - s = SUPERLU_MAX( s, (fabs(work[i]) + safe1) / - (rwork[i] + safe1) ); - } - berr[j] = s; - - /* Test stopping criterion. Continue iterating if - 1) The residual BERR(J) is larger than machine epsilon, and - 2) BERR(J) decreased by at least a factor of 2 during the - last iteration, and - 3) At most ITMAX iterations tried. */ - - if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { - /* Update solution and try again. */ - sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - -#ifdef _CRAY - SAXPY(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#else - saxpy_(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#endif - lstres = berr[j]; - ++count; - } else { - break; - } - - } /* end while */ - - stat->RefineSteps = count; - - /* Bound error from formula: - norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* - ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) - where - norm(Z) is the magnitude of the largest component of Z - inv(op(A)) is the inverse of op(A) - abs(Z) is the componentwise absolute value of the matrix or - vector Z - NZ is the maximum number of nonzeros in any row of A, plus 1 - EPS is machine epsilon - - The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) - is incremented by SAFE1 if the i-th component of - abs(op(A))*abs(X) + abs(B) is less than SAFE2. - - Use SLACON to estimate the infinity-norm of the matrix - inv(op(A)) * diag(W), - where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if ( notran ) { - for (k = 0; k < A->ncol; ++k) { - xk = fabs( Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - xk = fabs( Xptr[irow] ); - s += fabs(Aval[i]) * xk; - } - rwork[k] += s; - } - } - - for (i = 0; i < A->nrow; ++i) - if (rwork[i] > safe2) - rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; - else - rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; - - kase = 0; - - do { - slacon_(&A->nrow, &work[A->nrow], work, - &iwork[A->nrow], &ferr[j], &kase); - if (kase == 0) break; - - if (kase == 1) { - /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; - else if ( !notran && rowequ ) - for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; - - sgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); - - for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; - } else { - /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ - for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; - - sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; - else if ( !notran && rowequ ) - for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; - } - - } while ( kase != 0 ); - - - /* Normalize error. */ - lstres = 0.; - if ( notran && colequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); - } else if ( !notran && rowequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); - } else { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); - } - if ( lstres != 0. ) - ferr[j] /= lstres; - - } /* for each RHS j ... */ - - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - SUPERLU_FREE(iwork); - SUPERLU_FREE(Bjcol.Store); - - return; - -} /* sgsrfs */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssv.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,221 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_sdefs.h" - -void -sgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, - SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * SGSSV solves the system of linear equations A*X=B, using the - * LU factorization from SGSTRF. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. Permute the columns of A, forming A*Pc, where Pc - * is a permutation matrix. For more details of this step, - * see sp_preorder.c. - * - * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined - * by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 1.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the - * above algorithm to the transpose of A: - * - * 2.1. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix. - * For more details of this step, see sp_preorder.c. - * - * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr - * determined by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 2.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR; Dtype = SLU_S; Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, column permutation vector of size A->ncol - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * If A->Stype = SLU_NR, column permutation vector of size A->nrow - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or - * options->Fact = SamePattern_SameRowPerm, it is an input argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * Otherwise, it is an output argument. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->RowPerm = MY_PERMR or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument. - * otherwise it is an output argument. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - DNformat *Bstore; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int lwork = 0, *etree, i; - - /* Set default values for some parameters */ - float drop_tol = 0.; - int panel_size; /* panel size */ - int relax; /* no of columns in a relaxed snodes */ - int permc_spec; - trans_t trans = NOTRANS; - double *utime; - double t; /* Temporary time */ - - /* Test the input parameters ... */ - *info = 0; - Bstore = B->Store; - if ( options->Fact != DOFACT ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_S || A->Mtype != SLU_GE ) - *info = -2; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) - *info = -7; - if ( *info != 0 ) { - i = -(*info); - xerbla_("sgssv", &i); - return; - } - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - trans = TRANS; - } else { - if ( A->Stype == SLU_NC ) AA = A; - } - - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t; - - etree = intMalloc(A->ncol); - - t = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t; - - panel_size = sp_ienv(1); - relax = sp_ienv(2); - - /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4));*/ - t = SuperLU_timer_(); - /* Compute the LU factorization of A. */ - sgstrf(options, &AC, drop_tol, relax, panel_size, - etree, NULL, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t; - - t = SuperLU_timer_(); - if ( *info == 0 ) { - /* Solve the system A*X=B, overwriting B with X. */ - sgstrs (trans, L, U, perm_c, perm_r, B, stat, info); - } - utime[SOLVE] = SuperLU_timer_() - t; - - SUPERLU_FREE (etree); - Destroy_CompCol_Permuted(&AC); - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssvx.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,614 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_sdefs.h" - -void -sgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - int *etree, char *equed, float *R, float *C, - SuperMatrix *L, SuperMatrix *U, void *work, int lwork, - SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, - float *rcond, float *ferr, float *berr, - mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * SGSSVX solves the system of linear equations A*X=B or A'*X=B, using - * the LU factorization from sgstrf(). Error bounds on the solution and - * a condition estimate are also provided. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A is - * overwritten by diag(R)*A*diag(C) and B by diag(R)*B - * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans - * = TRANS or CONJ). - * - * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation - * matrix that usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 1.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the matrix A (after equilibration if options->Equil = YES) - * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. - * - * 1.4. Compute the reciprocal pivot growth factor. - * - * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form of - * A is used to estimate the condition number of the matrix A. If - * the reciprocal of the condition number is less than machine - * precision, info = A->ncol+1 is returned as a warning, but the - * routine still goes on to solve for X and computes error bounds - * as described below. - * - * 1.6. The system of equations is solved for X using the factored form - * of A. - * - * 1.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 1.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm - * to the transpose of A: - * - * 2.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A' is - * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B - * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). - * - * 2.2. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix that - * usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 2.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the transpose(A) (after equilibration if - * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the - * permutation Pr determined by partial pivoting. - * - * 2.4. Compute the reciprocal pivot growth factor. - * - * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form - * of transpose(A) is used to estimate the condition number of the - * matrix A. If the reciprocal of the condition number - * is less than machine precision, info = A->nrow+1 is returned as - * a warning, but the routine still goes on to solve for X and - * computes error bounds as described below. - * - * 2.6. The system of equations is solved for X using the factored form - * of transpose(A). - * - * 2.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 2.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input/output) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * On entry, If options->Fact = FACTORED and equed is not 'N', - * then A must have been equilibrated by the scaling factors in - * R and/or C. - * On exit, A is not modified if options->Equil = NO, or if - * options->Equil = YES but equed = 'N' on exit. - * Otherwise, if options->Equil = YES and equed is not 'N', - * A is scaled as follows: - * If A->Stype = SLU_NC: - * equed = 'R': A := diag(R) * A - * equed = 'C': A := A * diag(C) - * equed = 'B': A := diag(R) * A * diag(C). - * If A->Stype = SLU_NR: - * equed = 'R': transpose(A) := diag(R) * transpose(A) - * equed = 'C': transpose(A) := transpose(A) * diag(C) - * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * - * If A->Stype = SLU_NR, column permutation vector of size A->nrow, - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by a - * new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument. - * - * etree (input/output) int*, dimension (A->ncol) - * Elimination tree of Pc'*A'*A*Pc. - * If options->Fact != FACTORED and options->Fact != DOFACT, - * etree is an input argument, otherwise it is an output argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * equed (input/output) char* - * Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * If options->Fact = FACTORED, equed is an input argument, - * otherwise it is an output argument. - * - * R (input/output) float*, dimension (A->nrow) - * The row scale factors for A or transpose(A). - * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * If options->Fact = FACTORED, R is an input argument, - * otherwise, R is output. - * If options->zFact = FACTORED and equed = 'R' or 'B', each element - * of R must be positive. - * - * C (input/output) float*, dimension (A->ncol) - * The column scale factors for A or transpose(A). - * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * If options->Fact = FACTORED, C is an input argument, - * otherwise, C is output. - * If options->Fact = FACTORED and equed = 'C' or 'B', each element - * of C must be positive. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. - * - * work (workspace/output) void*, size (lwork) (in bytes) - * User supplied workspace, should be large enough - * to hold data structures for factors L and U. - * On exit, if fact is not 'F', L and U point to this array. - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * mem_usage->total_needed; no other side effects. - * - * See argument 'mem_usage' for memory usage statistics. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * If B->ncol = 0, only LU decomposition is performed, the triangular - * solve is skipped. - * On exit, - * if equed = 'N', B is not modified; otherwise - * if A->Stype = SLU_NC: - * if options->Trans = NOTRANS and equed = 'R' or 'B', - * B is overwritten by diag(R)*B; - * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', - * B is overwritten by diag(C)*B; - * if A->Stype = SLU_NR: - * if options->Trans = NOTRANS and equed = 'C' or 'B', - * B is overwritten by diag(C)*B; - * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', - * B is overwritten by diag(R)*B. - * - * X (output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * If info = 0 or info = A->ncol+1, X contains the solution matrix - * to the original system of equations. Note that A and B are modified - * on exit if equed is not 'N', and the solution to the equilibrated - * system is inv(diag(C))*X if options->Trans = NOTRANS and - * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' - * and equed = 'R' or 'B'. - * - * recip_pivot_growth (output) float* - * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). - * The infinity norm is used. If recip_pivot_growth is much less - * than 1, the stability of the LU factorization could be poor. - * - * rcond (output) float* - * The estimate of the reciprocal condition number of the matrix A - * after equilibration (if done). If rcond is less than the machine - * precision (in particular, if rcond = 0), the matrix is singular - * to working precision. This condition is indicated by a return - * code of info > 0. - * - * FERR (output) float*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * If options->IterRefine = NOREFINE, ferr = 1.0. - * - * BERR (output) float*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * If options->IterRefine = NOREFINE, berr = 1.0. - * - * mem_usage (output) mem_usage_t* - * Record the memory usage statistics, consisting of following fields: - * - for_lu (float) - * The amount of space used in bytes for L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * The number of memory expansions during the LU factorization. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly - * singular, so the solution and error bounds - * could not be computed. - * = A->ncol+1: U is nonsingular, but RCOND is less than machine - * precision, meaning that the matrix is singular to - * working precision. Nevertheless, the solution and - * error bounds are computed because there are a number - * of situations where the computed solution can be more - * accurate than the value of RCOND would suggest. - * > A->ncol+1: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - - DNformat *Bstore, *Xstore; - float *Bmat, *Xmat; - int ldb, ldx, nrhs; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int colequ, equil, nofact, notran, rowequ, permc_spec; - trans_t trant; - char norm[1]; - int i, j, info1; - float amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; - int relax, panel_size; - float diag_pivot_thresh, drop_tol; - double t0; /* temporary time */ - double *utime; - - /* External functions */ - extern float slangs(char *, SuperMatrix *); - extern double slamch_(char *); - - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - *info = 0; - nofact = (options->Fact != FACTORED); - equil = (options->Equil == YES); - notran = (options->Trans == NOTRANS); - if ( nofact ) { - *(unsigned char *)equed = 'N'; - rowequ = FALSE; - colequ = FALSE; - } else { - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - smlnum = slamch_("Safe minimum"); - bignum = 1. / smlnum; - } - -#if 0 -printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", - options->Fact, options->Trans, *equed); -#endif - - /* Test the input parameters */ - if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && - options->Fact != SamePattern_SameRowPerm && - !notran && options->Trans != TRANS && options->Trans != CONJ && - !equil && options->Equil != NO) - *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_S || A->Mtype != SLU_GE ) - *info = -2; - else if (options->Fact == FACTORED && - !(rowequ || colequ || lsame_(equed, "N"))) - *info = -6; - else { - if (rowequ) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, R[j]); - rcmax = SUPERLU_MAX(rcmax, R[j]); - } - if (rcmin <= 0.) *info = -7; - else if ( A->nrow > 0) - rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else rowcnd = 1.; - } - if (colequ && *info == 0) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, C[j]); - rcmax = SUPERLU_MAX(rcmax, C[j]); - } - if (rcmin <= 0.) *info = -8; - else if (A->nrow > 0) - colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else colcnd = 1.; - } - if (*info == 0) { - if ( lwork < -1 ) *info = -12; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_S || - B->Mtype != SLU_GE ) - *info = -13; - else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || - (B->ncol != 0 && B->ncol != X->ncol) || - X->Stype != SLU_DN || - X->Dtype != SLU_S || X->Mtype != SLU_GE ) - *info = -14; - } - } - if (*info != 0) { - i = -(*info); - xerbla_("sgssvx", &i); - return; - } - - /* Initialization for factor parameters */ - panel_size = sp_ienv(1); - relax = sp_ienv(2); - diag_pivot_thresh = options->DiagPivotThresh; - drop_tol = 0.0; - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - if ( notran ) { /* Reverse the transpose argument. */ - trant = TRANS; - notran = 0; - } else { - trant = NOTRANS; - notran = 1; - } - } else { /* A->Stype == SLU_NC */ - trant = options->Trans; - AA = A; - } - - if ( nofact && equil ) { - t0 = SuperLU_timer_(); - /* Compute row and column scalings to equilibrate the matrix A. */ - sgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); - - if ( info1 == 0 ) { - /* Equilibrate matrix A. */ - slaqgs(AA, R, C, rowcnd, colcnd, amax, equed); - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - } - utime[EQUIL] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Scale the right hand side if equilibration was performed. */ - if ( notran ) { - if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Bmat[i + j*ldb] *= R[i]; - } - } - } else if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Bmat[i + j*ldb] *= C[i]; - } - } - } - - if ( nofact ) { - - t0 = SuperLU_timer_(); - /* - * Gnet column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t0; - - t0 = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t0; - -/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4)); - fflush(stdout); */ - - /* Compute the LU factorization of A*Pc. */ - t0 = SuperLU_timer_(); - sgstrf(options, &AC, drop_tol, relax, panel_size, - etree, work, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t0; - - if ( lwork == -1 ) { - mem_usage->total_needed = *info - A->ncol; - return; - } - } - - if ( options->PivotGrowth ) { - if ( *info > 0 ) { - if ( *info <= A->ncol ) { - /* Compute the reciprocal pivot growth factor of the leading - rank-deficient *info columns of A. */ - *recip_pivot_growth = sPivotGrowth(*info, AA, perm_c, L, U); - } - return; - } - - /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ - *recip_pivot_growth = sPivotGrowth(A->ncol, AA, perm_c, L, U); - } - - if ( options->ConditionNumber ) { - /* Estimate the reciprocal of the condition number of A. */ - t0 = SuperLU_timer_(); - if ( notran ) { - *(unsigned char *)norm = '1'; - } else { - *(unsigned char *)norm = 'I'; - } - anorm = slangs(norm, AA); - sgscon(norm, L, U, anorm, rcond, stat, info); - utime[RCOND] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Compute the solution matrix X. */ - for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ - for (i = 0; i < B->nrow; i++) - Xmat[i + j*ldx] = Bmat[i + j*ldb]; - - t0 = SuperLU_timer_(); - sgstrs (trant, L, U, perm_c, perm_r, X, stat, info); - utime[SOLVE] = SuperLU_timer_() - t0; - - /* Use iterative refinement to improve the computed solution and compute - error bounds and backward error estimates for it. */ - t0 = SuperLU_timer_(); - if ( options->IterRefine != NOREFINE ) { - sgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, - X, ferr, berr, stat, info); - } else { - for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; - } - utime[REFINE] = SuperLU_timer_() - t0; - - /* Transform the solution matrix X to a solution of the original system. */ - if ( notran ) { - if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Xmat[i + j*ldx] *= C[i]; - } - } - } else if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - Xmat[i + j*ldx] *= R[i]; - } - } - } /* end if nrhs > 0 */ - - if ( options->ConditionNumber ) { - /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ - if ( *rcond < slamch_("E") ) *info = A->ncol + 1; - } - - if ( nofact ) { - sQuerySpace(L, U, mem_usage); - Destroy_CompCol_Permuted(&AC); - } - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrf.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,433 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -void -sgstrf (superlu_options_t *options, SuperMatrix *A, float drop_tol, - int relax, int panel_size, int *etree, void *work, int lwork, - int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * SGSTRF computes an LU factorization of a general sparse m-by-n - * matrix A using partial pivoting with row interchanges. - * The factorization has the form - * Pr * A = L * U - * where Pr is a row permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper - * triangular (upper trapezoidal if A->nrow < A->ncol). - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. - * - * drop_tol (input) float (NOT IMPLEMENTED) - * Drop tolerance parameter. At step j of the Gaussian elimination, - * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. - * 0 <= drop_tol <= 1. The default value of drop_tol is 0. - * - * relax (input) int - * To control degree of relaxing supernodes. If the number - * of nodes (columns) in a subtree of the elimination tree is less - * than relax, this subtree is considered as one supernode, - * regardless of the row structures of those columns. - * - * panel_size (input) int - * A panel consists of at most panel_size consecutive columns. - * - * etree (input) int*, dimension (A->ncol) - * Elimination tree of A'*A. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * On input, the columns of A should be permuted so that the - * etree is in a certain postorder. - * - * work (input/output) void*, size (lwork) (in bytes) - * User-supplied work space and space for the output data structures. - * Not referenced if lwork = 0; - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * *info; no other side effects. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * When searching for diagonal, perm_c[*] is applied to the - * row subscripts of A, so that diagonal threshold pivoting - * can find the diagonal of A, rather than that of A*Pc. - * - * perm_r (input/output) int*, dimension (A->nrow) - * Row permutation vector which defines the permutation matrix Pr, - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by - * a new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument; - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = SLU_NC, - * Dtype = SLU_S, Mtype = SLU_TRU. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. If lwork = -1, it is - * the estimated amount of space needed, plus A->ncol. - * - * ====================================================================== - * - * Local Working Arrays: - * ====================== - * m = number of rows in the matrix - * n = number of columns in the matrix - * - * xprune[0:n-1]: xprune[*] points to locations in subscript - * vector lsub[*]. For column i, xprune[i] denotes the point where - * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need - * to be traversed for symbolic factorization. - * - * marker[0:3*m-1]: marker[i] = j means that node i has been - * reached when working on column j. - * Storage: relative to original row subscripts - * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, - * see spanel_dfs.c; marker2 is used for inner-factorization, - * see scolumn_dfs.c. - * - * parent[0:m-1]: parent vector used during dfs - * Storage: relative to new row subscripts - * - * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) - * unexplored neighbor of i in lsub[*] - * - * segrep[0:nseg-1]: contains the list of supernodal representatives - * in topological order of the dfs. A supernode representative is the - * last column of a supernode. - * The maximum size of segrep[] is n. - * - * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a - * supernodal representative r, repfnz[r] is the location of the first - * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 - * indicates the supernode r has been explored. - * NOTE: There are W of them, each used for one column of a panel. - * - * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below - * the panel diagonal. These are filled in during spanel_dfs(), and are - * used later in the inner LU factorization within the panel. - * panel_lsub[]/dense[] pair forms the SPA data structure. - * NOTE: There are W of them. - * - * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; - * NOTE: there are W of them. - * - * tempv[0:*]: real temporary used for dense numeric kernels; - * The size of this array is defined by NUM_TEMPV() in ssp_defs.h. - * - */ - /* Local working arrays */ - NCPformat *Astore; - int *iperm_r = NULL; /* inverse of perm_r; used when - options->Fact == SamePattern_SameRowPerm */ - int *iperm_c; /* inverse of perm_c */ - int *iwork; - float *swork; - int *segrep, *repfnz, *parent, *xplore; - int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ - int *xprune; - int *marker; - float *dense, *tempv; - int *relax_end; - float *a; - int *asub; - int *xa_begin, *xa_end; - int *xsup, *supno; - int *xlsub, *xlusup, *xusub; - int nzlumax; - static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ - - /* Local scalars */ - fact_t fact = options->Fact; - double diag_pivot_thresh = options->DiagPivotThresh; - int pivrow; /* pivotal row number in the original matrix A */ - int nseg1; /* no of segments in U-column above panel row jcol */ - int nseg; /* no of segments in each U-column */ - register int jcol; - register int kcol; /* end column of a relaxed snode */ - register int icol; - register int i, k, jj, new_next, iinfo; - int m, n, min_mn, jsupno, fsupc, nextlu, nextu; - int w_def; /* upper bound on panel width */ - int usepr, iperm_r_allocated = 0; - int nnzL, nnzU; - int *panel_histo = stat->panel_histo; - flops_t *ops = stat->ops; - - iinfo = 0; - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN(m, n); - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - - /* Allocate storage common to the factor routines */ - *info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz, - panel_size, L, U, &Glu, &iwork, &swork); - if ( *info ) return; - - xsup = Glu.xsup; - supno = Glu.supno; - xlsub = Glu.xlsub; - xlusup = Glu.xlusup; - xusub = Glu.xusub; - - SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, - &repfnz, &panel_lsub, &xprune, &marker); - sSetRWork(m, panel_size, swork, &dense, &tempv); - - usepr = (fact == SamePattern_SameRowPerm); - if ( usepr ) { - /* Compute the inverse of perm_r */ - iperm_r = (int *) intMalloc(m); - for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; - iperm_r_allocated = 1; - } - iperm_c = (int *) intMalloc(n); - for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; - - /* Identify relaxed snodes */ - relax_end = (int *) intMalloc(n); - if ( options->SymmetricMode == YES ) { - heap_relax_snode(n, etree, relax, marker, relax_end); - } else { - relax_snode(n, etree, relax, marker, relax_end); - } - - ifill (perm_r, m, EMPTY); - ifill (marker, m * NO_MARKER, EMPTY); - supno[0] = -1; - xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; - w_def = panel_size; - - /* - * Work on one "panel" at a time. A panel is one of the following: - * (a) a relaxed supernode at the bottom of the etree, or - * (b) panel_size contiguous columns, defined by the user - */ - for (jcol = 0; jcol < min_mn; ) { - - if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ - kcol = relax_end[jcol]; /* end of the relaxed snode */ - panel_histo[kcol-jcol+1]++; - - /* -------------------------------------- - * Factorize the relaxed supernode(jcol:kcol) - * -------------------------------------- */ - /* Determine the union of the row structure of the snode */ - if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end, - xprune, marker, &Glu)) != 0 ) - return; - - nextu = xusub[jcol]; - nextlu = xlusup[jcol]; - jsupno = supno[jcol]; - fsupc = xsup[jsupno]; - new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); - nzlumax = Glu.nzlumax; - while ( new_next > nzlumax ) { - if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) - return; - } - - for (icol = jcol; icol<= kcol; icol++) { - xusub[icol+1] = nextu; - - /* Scatter into SPA dense[*] */ - for (k = xa_begin[icol]; k < xa_end[icol]; k++) - dense[asub[k]] = a[k]; - - /* Numeric update within the snode */ - ssnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); - - if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - -#ifdef DEBUG - sprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); -#endif - - } - - jcol = icol; - - } else { /* Work on one panel of panel_size columns */ - - /* Adjust panel_size so that a panel won't overlap with the next - * relaxed snode. - */ - panel_size = w_def; - for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) - if ( relax_end[k] != EMPTY ) { - panel_size = k - jcol; - break; - } - if ( k == min_mn ) panel_size = min_mn - jcol; - panel_histo[panel_size]++; - - /* symbolic factor on a panel of columns */ - spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, - dense, panel_lsub, segrep, repfnz, xprune, - marker, parent, xplore, &Glu); - - /* numeric sup-panel updates in topological order */ - spanel_bmod(m, panel_size, jcol, nseg1, dense, - tempv, segrep, repfnz, &Glu, stat); - - /* Sparse LU within the panel, and below panel diagonal */ - for ( jj = jcol; jj < jcol + panel_size; jj++) { - k = (jj - jcol) * m; /* column index for w-wide arrays */ - - nseg = nseg1; /* Begin after all the panel segments */ - - if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], - segrep, &repfnz[k], xprune, marker, - parent, xplore, &Glu)) != 0) return; - - /* Numeric updates */ - if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k], - tempv, &segrep[nseg1], &repfnz[k], - jcol, &Glu, stat)) != 0) return; - - /* Copy the U-segments to ucol[*] */ - if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k], - perm_r, &dense[k], &Glu)) != 0) - return; - - if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - - /* Prune columns (0:jj-1) using column jj */ - spruneL(jj, perm_r, pivrow, nseg, segrep, - &repfnz[k], xprune, &Glu); - - /* Reset repfnz[] for this column */ - resetrep_col (nseg, segrep, &repfnz[k]); - -#ifdef DEBUG - sprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); -#endif - - } - - jcol += panel_size; /* Move to the next panel */ - - } /* else */ - - } /* for */ - - *info = iinfo; - - if ( m > n ) { - k = 0; - for (i = 0; i < m; ++i) - if ( perm_r[i] == EMPTY ) { - perm_r[i] = n + k; - ++k; - } - } - - countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); - fixupL(min_mn, perm_r, &Glu); - - sLUWorkFree(iwork, swork, &Glu); /* Free work space and compress storage */ - - if ( fact == SamePattern_SameRowPerm ) { - /* L and U structures may have changed due to possibly different - pivoting, even though the storage is available. - There could also be memory expansions, so the array locations - may have changed, */ - ((SCformat *)L->Store)->nnz = nnzL; - ((SCformat *)L->Store)->nsuper = Glu.supno[n]; - ((SCformat *)L->Store)->nzval = Glu.lusup; - ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; - ((SCformat *)L->Store)->rowind = Glu.lsub; - ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; - ((NCformat *)U->Store)->nnz = nnzU; - ((NCformat *)U->Store)->nzval = Glu.ucol; - ((NCformat *)U->Store)->rowind = Glu.usub; - ((NCformat *)U->Store)->colptr = Glu.xusub; - } else { - sCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, - Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, - Glu.xsup, SLU_SC, SLU_S, SLU_TRLU); - sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, - Glu.usub, Glu.xusub, SLU_NC, SLU_S, SLU_TRU); - } - - ops[FACT] += ops[TRSV] + ops[GEMV]; - - if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); - SUPERLU_FREE (iperm_c); - SUPERLU_FREE (relax_end); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,332 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - - -/* - * Function prototypes - */ -void susolve(int, int, float*, float*); -void slsolve(int, int, float*, float*); -void smatvec(int, int, int, float*, float*, float*); - - -void -sgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, SuperMatrix *B, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * SGSTRS solves a system of linear equations A*X=B or A'*X=B - * with A sparse and B dense, using the LU factorization computed by - * SGSTRF. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * sgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * sgstrf(). Use column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (L->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (L->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ -#ifdef _CRAY - _fcd ftcs1, ftcs2, ftcs3, ftcs4; -#endif - int incx = 1, incy = 1; -#ifdef USE_VENDOR_BLAS - float alpha = 1.0, beta = 1.0; - float *work_col; -#endif - DNformat *Bstore; - float *Bmat; - SCformat *Lstore; - NCformat *Ustore; - float *Lval, *Uval; - int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; - int i, j, k, iptr, jcol, n, ldb, nrhs; - float *work, *rhs_work, *soln; - flops_t solve_ops; - void sprint_soln(); - - /* Test input parameters ... */ - *info = 0; - Bstore = B->Store; - ldb = Bstore->lda; - nrhs = B->ncol; - if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU ) - *info = -2; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU ) - *info = -3; - else if ( ldb < SUPERLU_MAX(0, L->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) - *info = -6; - if ( *info ) { - i = -(*info); - xerbla_("sgstrs", &i); - return; - } - - n = L->nrow; - work = floatCalloc(n * nrhs); - if ( !work ) ABORT("Malloc fails for local work[]."); - soln = floatMalloc(n); - if ( !soln ) ABORT("Malloc fails for local soln[]."); - - Bmat = Bstore->nzval; - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( trans == NOTRANS ) { - /* Permute right hand sides to form Pr*B */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - /* Forward solve PLy=Pb. */ - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1) * nrhs; - solve_ops += 2 * nrow * nsupc * nrhs; - - if ( nsupc == 1 ) { - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - luptr = L_NZ_START(fsupc); - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ - irow = L_SUB(iptr); - ++luptr; - rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; - } - } - } else { - luptr = L_NZ_START(fsupc); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#else - strsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - sgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#endif - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - work_col = &work[j*n]; - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work_col[i]; /* Scatter */ - work_col[i] = 0.0; - iptr++; - } - } -#else - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - slsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); - smatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], - &rhs_work[fsupc], &work[0] ); - - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - rhs_work[irow] -= work[i]; - work[i] = 0.0; - iptr++; - } - } -#endif - } /* else ... */ - } /* for L-solve */ - -#ifdef DEBUG - printf("After L-solve: y=\n"); - sprint_soln(n, nrhs, Bmat); -#endif - - /* - * Back solve Ux=y. - */ - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += nsupc * (nsupc + 1) * nrhs; - - if ( nsupc == 1 ) { - rhs_work = &Bmat[0]; - for (j = 0; j < nrhs; j++) { - rhs_work[fsupc] /= Lval[luptr]; - rhs_work += ldb; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("U", strlen("U")); - ftcs3 = _cptofcd("N", strlen("N")); - STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#else - strsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#endif -#else - for (j = 0; j < nrhs; j++) - susolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); -#endif - } - - for (j = 0; j < nrhs; ++j) { - rhs_work = &Bmat[j*ldb]; - for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ - irow = U_SUB(i); - rhs_work[irow] -= rhs_work[jcol] * Uval[i]; - } - } - } - - } /* for U-solve */ - -#ifdef DEBUG - printf("After U-solve: x=\n"); - sprint_soln(n, nrhs, Bmat); -#endif - - /* Compute the final solution X := Pc*X. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = solve_ops; - - } else { /* Solve A'*X=B or CONJ(A)*X=B */ - /* Permute right hand sides to form Pc'*B. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = 0; - for (k = 0; k < nrhs; ++k) { - - /* Multiply by inv(U'). */ - sp_strsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by inv(L'). */ - sp_strsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); - - } - /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - } - - SUPERLU_FREE(work); - SUPERLU_FREE(soln); -} - -/* - * Diagnostic print of the solution vector - */ -void -sprint_soln(int n, int nrhs, float *soln) -{ - int i; - - for (i = 0; i < n; i++) - printf("\t%d: %.4f\n", i, soln[i]); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slacon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slacon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slacon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,228 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_Cnames.h" - -int -slacon_(int *n, float *v, float *x, int *isgn, float *est, int *kase) - -{ -/* - Purpose - ======= - - SLACON estimates the 1-norm of a square matrix A. - Reverse communication is used for evaluating matrix-vector products. - - - Arguments - ========= - - N (input) INT - The order of the matrix. N >= 1. - - V (workspace) FLOAT PRECISION array, dimension (N) - On the final return, V = A*W, where EST = norm(V)/norm(W) - (W is not returned). - - X (input/output) FLOAT PRECISION array, dimension (N) - On an intermediate return, X should be overwritten by - A * X, if KASE=1, - A' * X, if KASE=2, - and SLACON must be re-called with all the other parameters - unchanged. - - ISGN (workspace) INT array, dimension (N) - - EST (output) FLOAT PRECISION - An estimate (a lower bound) for norm(A). - - KASE (input/output) INT - On the initial call to SLACON, KASE should be 0. - On an intermediate return, KASE will be 1 or 2, indicating - whether X should be overwritten by A * X or A' * X. - On the final return from SLACON, KASE will again be 0. - - Further Details - ======= ======= - - Contributed by Nick Higham, University of Manchester. - Originally named CONEST, dated March 16, 1988. - - Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of - a real or complex matrix, with applications to condition estimation", - ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. - ===================================================================== -*/ - - /* Table of constant values */ - int c__1 = 1; - float zero = 0.0; - float one = 1.0; - - /* Local variables */ - static int iter; - static int jump, jlast; - static float altsgn, estold; - static int i, j; - float temp; -#ifdef _CRAY - extern int ISAMAX(int *, float *, int *); - extern float SASUM(int *, float *, int *); - extern int SCOPY(int *, float *, int *, float *, int *); -#else - extern int isamax_(int *, float *, int *); - extern float sasum_(int *, float *, int *); - extern int scopy_(int *, float *, int *, float *, int *); -#endif -#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */ -#define i_dnnt(a) \ - ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */ - - if ( *kase == 0 ) { - for (i = 0; i < *n; ++i) { - x[i] = 1. / (float) (*n); - } - *kase = 1; - jump = 1; - return 0; - } - - switch (jump) { - case 1: goto L20; - case 2: goto L40; - case 3: goto L70; - case 4: goto L110; - case 5: goto L140; - } - - /* ................ ENTRY (JUMP = 1) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ - L20: - if (*n == 1) { - v[0] = x[0]; - *est = fabs(v[0]); - /* ... QUIT */ - goto L150; - } -#ifdef _CRAY - *est = SASUM(n, x, &c__1); -#else - *est = sasum_(n, x, &c__1); -#endif - - for (i = 0; i < *n; ++i) { - x[i] = d_sign(one, x[i]); - isgn[i] = i_dnnt(x[i]); - } - *kase = 2; - jump = 2; - return 0; - - /* ................ ENTRY (JUMP = 2) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ -L40: -#ifdef _CRAY - j = ISAMAX(n, &x[0], &c__1); -#else - j = isamax_(n, &x[0], &c__1); -#endif - --j; - iter = 2; - - /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ -L50: - for (i = 0; i < *n; ++i) x[i] = zero; - x[j] = one; - *kase = 1; - jump = 3; - return 0; - - /* ................ ENTRY (JUMP = 3) - X HAS BEEN OVERWRITTEN BY A*X. */ -L70: -#ifdef _CRAY - SCOPY(n, x, &c__1, v, &c__1); -#else - scopy_(n, x, &c__1, v, &c__1); -#endif - estold = *est; -#ifdef _CRAY - *est = SASUM(n, v, &c__1); -#else - *est = sasum_(n, v, &c__1); -#endif - - for (i = 0; i < *n; ++i) - if (i_dnnt(d_sign(one, x[i])) != isgn[i]) - goto L90; - - /* REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */ - goto L120; - -L90: - /* TEST FOR CYCLING. */ - if (*est <= estold) goto L120; - - for (i = 0; i < *n; ++i) { - x[i] = d_sign(one, x[i]); - isgn[i] = i_dnnt(x[i]); - } - *kase = 2; - jump = 4; - return 0; - - /* ................ ENTRY (JUMP = 4) - X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ -L110: - jlast = j; -#ifdef _CRAY - j = ISAMAX(n, &x[0], &c__1); -#else - j = isamax_(n, &x[0], &c__1); -#endif - --j; - if (x[jlast] != fabs(x[j]) && iter < 5) { - ++iter; - goto L50; - } - - /* ITERATION COMPLETE. FINAL STAGE. */ -L120: - altsgn = 1.; - for (i = 1; i <= *n; ++i) { - x[i-1] = altsgn * ((float)(i - 1) / (float)(*n - 1) + 1.); - altsgn = -altsgn; - } - *kase = 1; - jump = 5; - return 0; - - /* ................ ENTRY (JUMP = 5) - X HAS BEEN OVERWRITTEN BY A*X. */ -L140: -#ifdef _CRAY - temp = SASUM(n, x, &c__1) / (float)(*n * 3) * 2.; -#else - temp = sasum_(n, x, &c__1) / (float)(*n * 3) * 2.; -#endif - if (temp > *est) { -#ifdef _CRAY - SCOPY(n, &x[0], &c__1, &v[0], &c__1); -#else - scopy_(n, &x[0], &c__1, &v[0], &c__1); -#endif - *est = temp; - } - -L150: - *kase = 0; - return 0; - -} /* slacon_ */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slamch.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slamch.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slamch.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,984 +0,0 @@ -#include -#include "slu_Cnames.h" - -#define TRUE_ (1) -#define FALSE_ (0) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) -#define abs(x) ((x) >= 0 ? (x) : -(x)) -#define dabs(x) (double)abs(x) - -double slamch_(char *cmach) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMCH determines single precision machine parameters. - - Arguments - ========= - - CMACH (input) CHARACTER*1 - Specifies the value to be returned by SLAMCH: - = 'E' or 'e', SLAMCH := eps - = 'S' or 's , SLAMCH := sfmin - = 'B' or 'b', SLAMCH := base - = 'P' or 'p', SLAMCH := eps*base - = 'N' or 'n', SLAMCH := t - = 'R' or 'r', SLAMCH := rnd - = 'M' or 'm', SLAMCH := emin - = 'U' or 'u', SLAMCH := rmin - = 'L' or 'l', SLAMCH := emax - = 'O' or 'o', SLAMCH := rmax - - where - - eps = relative machine precision - sfmin = safe minimum, such that 1/sfmin does not overflow - base = base of the machine - prec = eps*base - t = number of (base) digits in the mantissa - rnd = 1.0 when rounding occurs in addition, 0.0 otherwise - emin = minimum exponent before (gradual) underflow - rmin = underflow threshold - base**(emin-1) - emax = largest exponent before overflow - rmax = overflow threshold - (base**emax)*(1-eps) - - ===================================================================== -*/ -/* >>Start of File<< - Initialized data */ - static int first = TRUE_; - /* System generated locals */ - int i__1; - float ret_val; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static float base; - static int beta; - static float emin, prec, emax; - static int imin, imax; - static int lrnd; - static float rmin, rmax, t, rmach; - extern int lsame_(char *, char *); - static float small, sfmin; - extern /* Subroutine */ int slamc2_(int *, int *, int *, float - *, int *, float *, int *, float *); - static int it; - static float rnd, eps; - - - - if (first) { - first = FALSE_; - slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); - base = (float) beta; - t = (float) it; - if (lrnd) { - rnd = 1.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1) / 2; - } else { - rnd = 0.f; - i__1 = 1 - it; - eps = pow_ri(&base, &i__1); - } - prec = eps * base; - emin = (float) imin; - emax = (float) imax; - sfmin = rmin; - small = 1.f / rmax; - if (small >= sfmin) { - -/* Use SMALL plus a bit, to avoid the possibility of rou -nding - causing overflow when computing 1/sfmin. */ - - sfmin = small * (eps + 1.f); - } - } - - if (lsame_(cmach, "E")) { - rmach = eps; - } else if (lsame_(cmach, "S")) { - rmach = sfmin; - } else if (lsame_(cmach, "B")) { - rmach = base; - } else if (lsame_(cmach, "P")) { - rmach = prec; - } else if (lsame_(cmach, "N")) { - rmach = t; - } else if (lsame_(cmach, "R")) { - rmach = rnd; - } else if (lsame_(cmach, "M")) { - rmach = emin; - } else if (lsame_(cmach, "U")) { - rmach = rmin; - } else if (lsame_(cmach, "L")) { - rmach = emax; - } else if (lsame_(cmach, "O")) { - rmach = rmax; - } - - ret_val = rmach; - return ret_val; - -/* End of SLAMCH */ - -} /* slamch_ */ - - -/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int - *ieee1) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC1 determines the machine parameters given by BETA, T, RND, and - IEEE1. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - IEEE1 (output) INT - Specifies whether rounding appears to be done in the IEEE - 'round to nearest' style. - - Further Details - =============== - - The routine is based on the routine ENVRON by Malcolm and - incorporates suggestions by Gentleman and Marovich. See - - Malcolm M. A. (1972) Algorithms to reveal properties of - floating-point arithmetic. Comms. of the ACM, 15, 949-951. - - Gentleman W. M. and Marovich S. B. (1974) More on algorithms - that reveal properties of floating point arithmetic units. - Comms. of the ACM, 17, 276-277. - - ===================================================================== -*/ - /* Initialized data */ - static int first = TRUE_; - /* System generated locals */ - float r__1, r__2; - /* Local variables */ - static int lrnd; - static float a, b, c, f; - static int lbeta; - static float savec; - static int lieee1; - static float t1, t2; - extern double slamc3_(float *, float *); - static int lt; - static float one, qtr; - - - - if (first) { - first = FALSE_; - one = 1.f; - -/* LBETA, LIEEE1, LT and LRND are the local values of BE -TA, - IEEE1, T and RND. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - Compute a = 2.0**m with the smallest positive integer m s -uch - that - - fl( a + 1.0 ) = a. */ - - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L10: - if (c == one) { - a *= 2; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L10; - } -/* + END WHILE - - Now compute b = 2.0**m with the smallest positive integer -m - such that - - fl( a + b ) .gt. a. */ - - b = 1.f; - c = slamc3_(&a, &b); - -/* + WHILE( C.EQ.A )LOOP */ -L20: - if (c == a) { - b *= 2; - c = slamc3_(&a, &b); - goto L20; - } -/* + END WHILE - - Now compute the base. a and c are neighbouring floating po -int - numbers in the interval ( beta**t, beta**( t + 1 ) ) and - so - their difference is beta. Adding 0.25 to c is to ensure that - it - is truncated to beta and not ( beta - 1 ). */ - - qtr = one / 4; - savec = c; - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - lbeta = c + qtr; - -/* Now determine whether rounding or chopping occurs, by addin -g a - bit less than beta/2 and a bit more than beta/2 to - a. */ - - b = (float) lbeta; - r__1 = b / 2; - r__2 = -(double)b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (c == a) { - lrnd = TRUE_; - } else { - lrnd = FALSE_; - } - r__1 = b / 2; - r__2 = b / 100; - f = slamc3_(&r__1, &r__2); - c = slamc3_(&f, &a); - if (lrnd && c == a) { - lrnd = FALSE_; - } - -/* Try and decide whether rounding is done in the IEEE 'round - to - nearest' style. B/2 is half a unit in the last place of the -two - numbers A and SAVEC. Furthermore, A is even, i.e. has last -bit - zero, and SAVEC is odd. Thus adding B/2 to A should not cha -nge - A, but adding B/2 to SAVEC should change SAVEC. */ - - r__1 = b / 2; - t1 = slamc3_(&r__1, &a); - r__1 = b / 2; - t2 = slamc3_(&r__1, &savec); - lieee1 = t1 == a && t2 > savec && lrnd; - -/* Now find the mantissa, t. It should be the integer part - of - log to the base beta of a, however it is safer to determine - t - by powering. So we find t as the smallest positive integer -for - which - - fl( beta**t + 1.0 ) = 1.0. */ - - lt = 0; - a = 1.f; - c = 1.f; - -/* + WHILE( C.EQ.ONE )LOOP */ -L30: - if (c == one) { - ++lt; - a *= lbeta; - c = slamc3_(&a, &one); - r__1 = -(double)a; - c = slamc3_(&c, &r__1); - goto L30; - } -/* + END WHILE */ - - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *ieee1 = lieee1; - return 0; - -/* End of SLAMC1 */ - -} /* slamc1_ */ - - -/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float * - eps, int *emin, float *rmin, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC2 determines the machine parameters specified in its argument - list. - - Arguments - ========= - - BETA (output) INT - The base of the machine. - - T (output) INT - The number of ( BETA ) digits in the mantissa. - - RND (output) INT - Specifies whether proper rounding ( RND = .TRUE. ) or - chopping ( RND = .FALSE. ) occurs in addition. This may not - - be a reliable guide to the way in which the machine performs - - its arithmetic. - - EPS (output) FLOAT - The smallest positive number such that - - fl( 1.0 - EPS ) .LT. 1.0, - - where fl denotes the computed value. - - EMIN (output) INT - The minimum exponent before (gradual) underflow occurs. - - RMIN (output) FLOAT - The smallest normalized number for the machine, given by - BASE**( EMIN - 1 ), where BASE is the floating point value - - of BETA. - - EMAX (output) INT - The maximum exponent before overflow occurs. - - RMAX (output) FLOAT - The largest positive number for the machine, given by - BASE**EMAX * ( 1 - EPS ), where BASE is the floating point - - value of BETA. - - Further Details - =============== - - The computation of EPS is based on a routine PARANOIA by - W. Kahan of the University of California at Berkeley. - - ===================================================================== -*/ - /* Table of constant values */ - static int c__1 = 1; - - /* Initialized data */ - static int first = TRUE_; - static int iwarn = FALSE_; - /* System generated locals */ - int i__1; - float r__1, r__2, r__3, r__4, r__5; - /* Builtin functions */ - double pow_ri(float *, int *); - /* Local variables */ - static int ieee; - static float half; - static int lrnd; - static float leps, zero, a, b, c; - static int i, lbeta; - static float rbase; - static int lemin, lemax, gnmin; - static float small; - static int gpmin; - static float third, lrmin, lrmax, sixth; - static int lieee1; - extern /* Subroutine */ int slamc1_(int *, int *, int *, - int *); - extern double slamc3_(float *, float *); - extern /* Subroutine */ int slamc4_(int *, float *, int *), - slamc5_(int *, int *, int *, int *, int *, - float *); - static int lt, ngnmin, ngpmin; - static float one, two; - - - - if (first) { - first = FALSE_; - zero = 0.f; - one = 1.f; - two = 2.f; - -/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values - of - BETA, T, RND, EPS, EMIN and RMIN. - - Throughout this routine we use the function SLAMC3 to ens -ure - that relevant values are stored and not held in registers, - or - are not affected by optimizers. - - SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. -*/ - - slamc1_(&lbeta, <, &lrnd, &lieee1); - -/* Start to find EPS. */ - - b = (float) lbeta; - i__1 = -lt; - a = pow_ri(&b, &i__1); - leps = a; - -/* Try some tricks to see whether or not this is the correct E -PS. */ - - b = two / 3; - half = one / 2; - r__1 = -(double)half; - sixth = slamc3_(&b, &r__1); - third = slamc3_(&sixth, &sixth); - r__1 = -(double)half; - b = slamc3_(&third, &r__1); - b = slamc3_(&b, &sixth); - b = dabs(b); - if (b < leps) { - b = leps; - } - - leps = 1.f; - -/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ -L10: - if (leps > b && b > zero) { - leps = b; - r__1 = half * leps; -/* Computing 5th power */ - r__3 = two, r__4 = r__3, r__3 *= r__3; -/* Computing 2nd power */ - r__5 = leps; - r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5); - c = slamc3_(&r__1, &r__2); - r__1 = -(double)c; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - r__1 = -(double)b; - c = slamc3_(&half, &r__1); - b = slamc3_(&half, &c); - goto L10; - } -/* + END WHILE */ - - if (a < leps) { - leps = a; - } - -/* Computation of EPS complete. - - Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 -)). - Keep dividing A by BETA until (gradual) underflow occurs. T -his - is detected when we cannot recover the previous A. */ - - rbase = one / lbeta; - small = one; - for (i = 1; i <= 3; ++i) { - r__1 = small * rbase; - small = slamc3_(&r__1, &zero); -/* L20: */ - } - a = slamc3_(&one, &small); - slamc4_(&ngpmin, &one, &lbeta); - r__1 = -(double)one; - slamc4_(&ngnmin, &r__1, &lbeta); - slamc4_(&gpmin, &a, &lbeta); - r__1 = -(double)a; - slamc4_(&gnmin, &r__1, &lbeta); - ieee = FALSE_; - - if (ngpmin == ngnmin && gpmin == gnmin) { - if (ngpmin == gpmin) { - lemin = ngpmin; -/* ( Non twos-complement machines, no gradual under -flow; - e.g., VAX ) */ - } else if (gpmin - ngpmin == 3) { - lemin = ngpmin - 1 + lt; - ieee = TRUE_; -/* ( Non twos-complement machines, with gradual und -erflow; - e.g., IEEE standard followers ) */ - } else { - lemin = min(ngpmin,gpmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if (ngpmin == gpmin && ngnmin == gnmin) { - if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { - lemin = max(ngpmin,ngnmin); -/* ( Twos-complement machines, no gradual underflow -; - e.g., CYBER 205 ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) - { - if (gpmin - min(ngpmin,ngnmin) == 3) { - lemin = max(ngpmin,ngnmin) - 1 + lt; -/* ( Twos-complement machines with gradual underflo -w; - no known machine ) */ - } else { - lemin = min(ngpmin,ngnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } - - } else { -/* Computing MIN */ - i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); - lemin = min(i__1,gnmin); -/* ( A guess; no known machine ) */ - iwarn = TRUE_; - } -/* ** - Comment out this if block if EMIN is ok */ - if (iwarn) { - first = TRUE_; - printf("\n\n WARNING. The value EMIN may be incorrect:- "); - printf("EMIN = %8i\n",lemin); - printf("If, after inspection, the value EMIN looks acceptable"); - printf("please comment out \n the IF block as marked within the"); - printf("code of routine SLAMC2, \n otherwise supply EMIN"); - printf("explicitly.\n"); - } -/* ** - - Assume IEEE arithmetic if we found denormalised numbers abo -ve, - or if arithmetic seems to round in the IEEE style, determi -ned - in routine SLAMC1. A true IEEE machine should have both thi -ngs - true; however, faulty machines may have one or the other. */ - - ieee = ieee || lieee1; - -/* Compute RMIN by successive division by BETA. We could comp -ute - RMIN as BASE**( EMIN - 1 ), but some machines underflow dur -ing - this computation. */ - - lrmin = 1.f; - i__1 = 1 - lemin; - for (i = 1; i <= 1-lemin; ++i) { - r__1 = lrmin * rbase; - lrmin = slamc3_(&r__1, &zero); -/* L30: */ - } - -/* Finally, call SLAMC5 to compute EMAX and RMAX. */ - - slamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); - } - - *beta = lbeta; - *t = lt; - *rnd = lrnd; - *eps = leps; - *emin = lemin; - *rmin = lrmin; - *emax = lemax; - *rmax = lrmax; - - return 0; - - -/* End of SLAMC2 */ - -} /* slamc2_ */ - - -double slamc3_(float *a, float *b) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC3 is intended to force A and B to be stored prior to doing - - the addition of A and B , for use in situations where optimizers - - might hold one of these in a register. - - Arguments - ========= - - A, B (input) FLOAT - The values A and B. - - ===================================================================== -*/ -/* >>Start of File<< - System generated locals */ - float ret_val; - - - - ret_val = *a + *b; - - return ret_val; - -/* End of SLAMC3 */ - -} /* slamc3_ */ - - -/* Subroutine */ int slamc4_(int *emin, float *start, int *base) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC4 is a service routine for SLAMC2. - - Arguments - ========= - - EMIN (output) EMIN - The minimum exponent before (gradual) underflow, computed by - - setting A = START and dividing by BASE until the previous A - can not be recovered. - - START (input) FLOAT - The starting point for determining EMIN. - - BASE (input) INT - The base of the machine. - - ===================================================================== -*/ - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static float zero, a; - static int i; - static float rbase, b1, b2, c1, c2, d1, d2; - extern double slamc3_(float *, float *); - static float one; - - - - a = *start; - one = 1.f; - rbase = one / *base; - zero = 0.f; - *emin = 1; - r__1 = a * rbase; - b1 = slamc3_(&r__1, &zero); - c1 = a; - c2 = a; - d1 = a; - d2 = a; -/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. - $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ -L10: - if (c1 == a && c2 == a && d1 == a && d2 == a) { - --(*emin); - a = b1; - r__1 = a / *base; - b1 = slamc3_(&r__1, &zero); - r__1 = b1 * *base; - c1 = slamc3_(&r__1, &zero); - d1 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d1 += b1; -/* L20: */ - } - r__1 = a * rbase; - b2 = slamc3_(&r__1, &zero); - r__1 = b2 / rbase; - c2 = slamc3_(&r__1, &zero); - d2 = zero; - i__1 = *base; - for (i = 1; i <= *base; ++i) { - d2 += b2; -/* L30: */ - } - goto L10; - } -/* + END WHILE */ - - return 0; - -/* End of SLAMC4 */ - -} /* slamc4_ */ - - -/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, - int *ieee, int *emax, float *rmax) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - October 31, 1992 - - - Purpose - ======= - - SLAMC5 attempts to compute RMAX, the largest machine floating-point - number, without overflow. It assumes that EMAX + abs(EMIN) sum - approximately to a power of 2. It will fail on machines where this - assumption does not hold, for example, the Cyber 205 (EMIN = -28625, - - EMAX = 28718). It will also fail if the value supplied for EMIN is - too large (i.e. too close to zero), probably with overflow. - - Arguments - ========= - - BETA (input) INT - The base of floating-point arithmetic. - - P (input) INT - The number of base BETA digits in the mantissa of a - floating-point value. - - EMIN (input) INT - The minimum exponent before (gradual) underflow. - - IEEE (input) INT - A logical flag specifying whether or not the arithmetic - system is thought to comply with the IEEE standard. - - EMAX (output) INT - The largest exponent before overflow - - RMAX (output) FLOAT - The largest machine floating-point number. - - ===================================================================== - - - - First compute LEXP and UEXP, two powers of 2 that bound - abs(EMIN). We then assume that EMAX + abs(EMIN) will sum - approximately to the bound that is closest to abs(EMIN). - (EMAX is the exponent of the required number RMAX). */ - /* Table of constant values */ - static float c_b5 = 0.f; - - /* System generated locals */ - int i__1; - float r__1; - /* Local variables */ - static int lexp; - static float oldy; - static int uexp, i; - static float y, z; - static int nbits; - extern double slamc3_(float *, float *); - static float recbas; - static int exbits, expsum, try__; - - - - lexp = 1; - exbits = 1; -L10: - try__ = lexp << 1; - if (try__ <= -(*emin)) { - lexp = try__; - ++exbits; - goto L10; - } - if (lexp == -(*emin)) { - uexp = lexp; - } else { - uexp = try__; - ++exbits; - } - -/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater - than or equal to EMIN. EXBITS is the number of bits needed to - store the exponent. */ - - if (uexp + *emin > -lexp - *emin) { - expsum = lexp << 1; - } else { - expsum = uexp << 1; - } - -/* EXPSUM is the exponent range, approximately equal to - EMAX - EMIN + 1 . */ - - *emax = expsum + *emin - 1; - nbits = exbits + 1 + *p; - -/* NBITS is the total number of bits needed to store a - floating-point number. */ - - if (nbits % 2 == 1 && *beta == 2) { - -/* Either there are an odd number of bits used to store a - floating-point number, which is unlikely, or some bits are - - not used in the representation of numbers, which is possible -, - (e.g. Cray machines) or the mantissa has an implicit bit, - (e.g. IEEE machines, Dec Vax machines), which is perhaps the - - most likely. We have to assume the last alternative. - If this is true, then we need to reduce EMAX by one because - - there must be some way of representing zero in an implicit-b -it - system. On machines like Cray, we are reducing EMAX by one - - unnecessarily. */ - - --(*emax); - } - - if (*ieee) { - -/* Assume we are on an IEEE machine which reserves one exponent - - for infinity and NaN. */ - - --(*emax); - } - -/* Now create RMAX, the largest machine number, which should - be equal to (1.0 - BETA**(-P)) * BETA**EMAX . - - First compute 1.0 - BETA**(-P), being careful that the - result is less than 1.0 . */ - - recbas = 1.f / *beta; - z = *beta - 1.f; - y = 0.f; - i__1 = *p; - for (i = 1; i <= *p; ++i) { - z *= recbas; - if (y < 1.f) { - oldy = y; - } - y = slamc3_(&y, &z); -/* L20: */ - } - if (y >= 1.f) { - y = oldy; - } - -/* Now multiply by BETA**EMAX to get RMAX. */ - - i__1 = *emax; - for (i = 1; i <= *emax; ++i) { - r__1 = y * *beta; - y = slamc3_(&r__1, &c_b5); -/* L30: */ - } - - *rmax = y; - return 0; - -/* End of SLAMC5 */ - -} /* slamc5_ */ - - -double pow_ri(float *ap, int *bp) -{ -double pow, x; -int n; - -pow = 1; -x = *ap; -n = *bp; - -if(n != 0) - { - if(n < 0) - { - n = -n; - x = 1/x; - } - for( ; ; ) - { - if(n & 01) - pow *= x; - if(n >>= 1) - x *= x; - else - break; - } - } -return(pow); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slangs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,110 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: slangs.c - * History: Modified from lapack routine SLANGE - */ -#include -#include "slu_sdefs.h" - -float slangs(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - SLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - SLANGE returns the value - - SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in SLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - float *Aval; - int i, j, irow; - float value, sum; - float *rwork; - - Astore = A->Store; - Aval = Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, fabs( Aval[i]) ); - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += fabs(Aval[i]); - value = SUPERLU_MAX(value,sum); - } - - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += fabs(Aval[i]); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* slangs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slaqgs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: slaqgs.c - * History: Modified from LAPACK routine SLAQGE - */ -#include -#include "slu_sdefs.h" - -void -slaqgs(SuperMatrix *A, float *r, float *c, - float rowcnd, float colcnd, float amax, char *equed) -{ -/* - Purpose - ======= - - SLAQGS equilibrates a general sparse M by N matrix A using the row and - scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = NC; Dtype = SLU_S; Mtype = GE. - - R (input) float*, dimension (A->nrow) - The row scale factors for A. - - C (input) float*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) float - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) float - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) float - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - float *Aval; - int i, j, irow; - float large, small, cj; - extern double slamch_(char *); - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = slamch_("Safe minimum") / slamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - Aval[i] *= cj; - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= r[irow]; - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - Aval[i] *= cj * r[irow]; - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* slaqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_cdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_cdefs.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_cdefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_cdefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,237 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#ifndef __SUPERLU_cSP_DEFS /* allow multiple inclusions */ -#define __SUPERLU_cSP_DEFS - -/* - * File name: csp_defs.h - * Purpose: Sparse matrix types and function prototypes - * History: - */ - -#ifdef _CRAY -#include -#include -#endif - -/* Define my integer type int_t */ -typedef int int_t; /* default */ - -#include "slu_Cnames.h" -#include "supermatrix.h" -#include "slu_util.h" -#include "slu_scomplex.h" - - -/* - * Global data structures used in LU factorization - - * - * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper]. - * (xsup,supno): supno[i] is the supernode no to which i belongs; - * xsup(s) points to the beginning of the s-th supernode. - * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) - * xsup 0 1 2 4 7 12 - * Note: dfs will be performed on supernode rep. relative to the new - * row pivoting ordering - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: original row subscripts - * - * During the course of sparse LU factorization, we also use - * (xlsub,lsub) for the purpose of symmetric pruning. For each - * supernode {s,s+1,...,t=s+r} with first column s and last - * column t, the subscript set - * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * Furthermore, - * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * However, if the supernode has only one column, then we - * only keep one set of subscripts. For any subscript interchange - * performed, similar interchange must be done on the numerical - * values. - * - * The last column structures (for pruning) will be removed - * after the numercial LU factorization phase. - * - * (xlusup,lusup): lusup[*] contains the numerical values of the - * rectangular supernodes; xlusup[j] points to the starting - * location of the j-th column in storage vector lusup[*] - * Note: xlusup is indexed by column. - * Each rectangular supernode is stored by column-major - * scheme, consistent with Fortran 2-dim array storage. - * - * (xusub,ucol,usub): ucol[*] stores the numerical values of - * U-columns outside the rectangular supernodes. The row - * subscript of nonzero ucol[k] is stored in usub[k]. - * xusub[i] points to the starting location of column i in ucol. - * Storage: new row subscripts; that is subscripts of PA. - */ -typedef struct { - int *xsup; /* supernode and column mapping */ - int *supno; - int *lsub; /* compressed L subscripts */ - int *xlsub; - complex *lusup; /* L supernodes */ - int *xlusup; - complex *ucol; /* U columns */ - int *usub; - int *xusub; - int nzlmax; /* current max size of lsub */ - int nzumax; /* " " " ucol */ - int nzlumax; /* " " " lusup */ - int n; /* number of columns in the matrix */ - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ -} GlobalLU_t; - -typedef struct { - float for_lu; - float total_needed; - int expansions; -} mem_usage_t; - -#ifdef __cplusplus -extern "C" { -#endif - -/* Driver routines */ -extern void -cgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); -extern void -cgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, - char *, float *, float *, SuperMatrix *, SuperMatrix *, - void *, int, SuperMatrix *, SuperMatrix *, - float *, float *, float *, float *, - mem_usage_t *, SuperLUStat_t *, int *); - -/* Supernodal LU factor related */ -extern void -cCreate_CompCol_Matrix(SuperMatrix *, int, int, int, complex *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -cCreate_CompRow_Matrix(SuperMatrix *, int, int, int, complex *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -cCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); -extern void -cCreate_Dense_Matrix(SuperMatrix *, int, int, complex *, int, - Stype_t, Dtype_t, Mtype_t); -extern void -cCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, complex *, - int *, int *, int *, int *, int *, - Stype_t, Dtype_t, Mtype_t); -extern void -cCopy_Dense_Matrix(int, int, complex *, int, complex *, int); - -extern void countnz (const int, int *, int *, int *, GlobalLU_t *); -extern void fixupL (const int, const int *, GlobalLU_t *); - -extern void callocateA (int, int, complex **, int **, int **); -extern void cgstrf (superlu_options_t*, SuperMatrix*, float, - int, int, int*, void *, int, int *, int *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); -extern int csnode_dfs (const int, const int, const int *, const int *, - const int *, int *, int *, GlobalLU_t *); -extern int csnode_bmod (const int, const int, const int, complex *, - complex *, GlobalLU_t *, SuperLUStat_t*); -extern void cpanel_dfs (const int, const int, const int, SuperMatrix *, - int *, int *, complex *, int *, int *, int *, - int *, int *, int *, int *, GlobalLU_t *); -extern void cpanel_bmod (const int, const int, const int, const int, - complex *, complex *, int *, int *, - GlobalLU_t *, SuperLUStat_t*); -extern int ccolumn_dfs (const int, const int, int *, int *, int *, int *, - int *, int *, int *, int *, int *, GlobalLU_t *); -extern int ccolumn_bmod (const int, const int, complex *, - complex *, int *, int *, int, - GlobalLU_t *, SuperLUStat_t*); -extern int ccopy_to_ucol (int, int, int *, int *, int *, - complex *, GlobalLU_t *); -extern int cpivotL (const int, const float, int *, int *, - int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); -extern void cpruneL (const int, const int *, const int, const int, - const int *, const int *, int *, GlobalLU_t *); -extern void creadmt (int *, int *, int *, complex **, int **, int **); -extern void cGenXtrue (int, int, complex *, int); -extern void cFillRHS (trans_t, int, complex *, int, SuperMatrix *, - SuperMatrix *); -extern void cgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, - SuperMatrix *, SuperLUStat_t*, int *); - - -/* Driver related */ - -extern void cgsequ (SuperMatrix *, float *, float *, float *, - float *, float *, int *); -extern void claqgs (SuperMatrix *, float *, float *, float, - float, float, char *); -extern void cgscon (char *, SuperMatrix *, SuperMatrix *, - float, float *, SuperLUStat_t*, int *); -extern float cPivotGrowth(int, SuperMatrix *, int *, - SuperMatrix *, SuperMatrix *); -extern void cgsrfs (trans_t, SuperMatrix *, SuperMatrix *, - SuperMatrix *, int *, int *, char *, float *, - float *, SuperMatrix *, SuperMatrix *, - float *, float *, SuperLUStat_t*, int *); - -extern int sp_ctrsv (char *, char *, char *, SuperMatrix *, - SuperMatrix *, complex *, SuperLUStat_t*, int *); -extern int sp_cgemv (char *, complex, SuperMatrix *, complex *, - int, complex, complex *, int); - -extern int sp_cgemm (char *, char *, int, int, int, complex, - SuperMatrix *, complex *, int, complex, - complex *, int); - -/* Memory-related */ -extern int cLUMemInit (fact_t, void *, int, int, int, int, int, - SuperMatrix *, SuperMatrix *, - GlobalLU_t *, int **, complex **); -extern void cSetRWork (int, int, complex *, complex **, complex **); -extern void cLUWorkFree (int *, complex *, GlobalLU_t *); -extern int cLUMemXpand (int, int, MemType, int *, GlobalLU_t *); - -extern complex *complexMalloc(int); -extern complex *complexCalloc(int); -extern float *floatMalloc(int); -extern float *floatCalloc(int); -extern int cmemory_usage(const int, const int, const int, const int); -extern int cQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); - -/* Auxiliary routines */ -extern void creadhb(int *, int *, int *, complex **, int **, int **); -extern void cCompRow_to_CompCol(int, int, int, complex*, int*, int*, - complex **, int **, int **); -extern void cfill (complex *, int, complex); -extern void cinf_norm_error (int, SuperMatrix *, complex *); -extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, - complex, complex, complex *, complex *, char *); - -/* Routines for debugging */ -extern void cPrint_CompCol_Matrix(char *, SuperMatrix *); -extern void cPrint_SuperNode_Matrix(char *, SuperMatrix *); -extern void cPrint_Dense_Matrix(char *, SuperMatrix *); -extern void print_lu_col(char *, int, int, int *, GlobalLU_t *); -extern void check_tempv(int, complex *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_cSP_DEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_Cnames.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_Cnames.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_Cnames.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_Cnames.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,335 +0,0 @@ -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 1, 1997 - * - */ -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define ADD__ 1 -#define NOCHANGE 2 -#define UPCASE 3 -#define C_CALL 4 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifdef Add__ -#define F77_CALL_C ADD__ -#endif - -/* Default */ -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. - */ - -#endif - -#if (F77_CALL_C == ADD__) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm__(...) - */ -/* BLAS */ -#define sasum_ sasum__ -#define isamax_ isamax__ -#define scopy_ scopy__ -#define sscal_ sscal__ -#define sger_ sger__ -#define snrm2_ snrm2__ -#define ssymv_ ssymv__ -#define sdot_ sdot__ -#define saxpy_ saxpy__ -#define ssyr2_ ssyr2__ -#define srot_ srot__ -#define sgemv_ sgemv__ -#define strsv_ strsv__ -#define sgemm_ sgemm__ -#define strsm_ strsm__ - -#define dasum_ dasum__ -#define idamax_ idamax__ -#define dcopy_ dcopy__ -#define dscal_ dscal__ -#define dger_ dger__ -#define dnrm2_ dnrm2__ -#define dsymv_ dsymv__ -#define ddot_ ddot__ -#define daxpy_ daxpy__ -#define dsyr2_ dsyr2__ -#define drot_ drot__ -#define dgemv_ dgemv__ -#define dtrsv_ dtrsv__ -#define dgemm_ dgemm__ -#define dtrsm_ dtrsm__ - -#define scasum_ scasum__ -#define icamax_ icamax__ -#define ccopy_ ccopy__ -#define cscal_ cscal__ -#define scnrm2_ scnrm2__ -#define caxpy_ caxpy__ -#define cgemv_ cgemv__ -#define ctrsv_ ctrsv__ -#define cgemm_ cgemm__ -#define ctrsm_ ctrsm__ -#define cgerc_ cgerc__ -#define chemv_ chemv__ -#define cher2_ cher2__ - -#define dzasum_ dzasum__ -#define izamax_ izamax__ -#define zcopy_ zcopy__ -#define zscal_ zscal__ -#define dznrm2_ dznrm2__ -#define zaxpy_ zaxpy__ -#define zgemv_ zgemv__ -#define ztrsv_ ztrsv__ -#define zgemm_ zgemm__ -#define ztrsm_ ztrsm__ -#define zgerc_ zgerc__ -#define zhemv_ zhemv__ -#define zher2_ zher2__ - -/* LAPACK */ -#define dlamch_ dlamch__ -#define slamch_ slamch__ -#define xerbla_ xerbla__ -#define lsame_ lsame__ -#define dlacon_ dlacon__ -#define slacon_ slacon__ -#define icmax1_ icmax1__ -#define scsum1_ scsum1__ -#define clacon_ clacon__ -#define dzsum1_ dzsum1__ -#define izmax1_ izmax1__ -#define zlacon_ zlacon__ - -/* Fortran interface */ -#define c_bridge_dgssv_ c_bridge_dgssv__ -#define c_fortran_sgssv_ c_fortran_sgssv__ -#define c_fortran_dgssv_ c_fortran_dgssv__ -#define c_fortran_cgssv_ c_fortran_cgssv__ -#define c_fortran_zgssv_ c_fortran_zgssv__ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ SASUM -#define idamax_ ISAMAX -#define dcopy_ SCOPY -#define dscal_ SSCAL -#define dger_ SGER -#define dnrm2_ SNRM2 -#define dsymv_ SSYMV -#define ddot_ SDOT -#define daxpy_ SAXPY -#define dsyr2_ SSYR2 -#define drot_ SROT -#define dgemv_ SGEMV -#define dtrsv_ STRSV -#define dgemm_ SGEMM -#define dtrsm_ STRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ SCASUM -#define izamax_ ICAMAX -#define zcopy_ CCOPY -#define zscal_ CSCAL -#define dznrm2_ SCNRM2 -#define zaxpy_ CAXPY -#define zgemv_ CGEMV -#define ztrsv_ CTRSV -#define zgemm_ CGEMM -#define ztrsm_ CTRSM -#define zgerc_ CGERC -#define zhemv_ CHEMV -#define zher2_ CHER2 - -/* LAPACK */ -#define dlamch_ DLAMCH -#define slamch_ SLAMCH -#define xerbla_ XERBLA -#define lsame_ LSAME -#define dlacon_ DLACON -#define slacon_ SLACON -#define icmax1_ ICMAX1 -#define scsum1_ SCSUM1 -#define clacon_ CLACON -#define dzsum1_ DZSUM1 -#define izmax1_ IZMAX1 -#define zlacon_ ZLACON - -/* Fortran interface */ -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_sgssv_ C_FORTRAN_SGSSV -#define c_fortran_dgssv_ C_FORTRAN_DGSSV -#define c_fortran_cgssv_ C_FORTRAN_CGSSV -#define c_fortran_zgssv_ C_FORTRAN_ZGSSV -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) - */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 - -/* LAPACK */ -#define dlamch_ dlamch -#define slamch_ slamch -#define xerbla_ xerbla -#define lsame_ lsame -#define dlacon_ dlacon -#define slacon_ slacon -#define icmax1_ icmax1 -#define scsum1_ scsum1 -#define clacon_ clacon -#define dzsum1_ dzsum1 -#define izmax1_ izmax1 -#define zlacon_ zlacon - -/* Fortran interface */ -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_sgssv_ c_fortran_sgssv -#define c_fortran_dgssv_ c_fortran_dgssv -#define c_fortran_cgssv_ c_fortran_cgssv -#define c_fortran_zgssv_ c_fortran_zgssv -#endif - -#endif /* __SUPERLU_CNAMES */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */ -#define __SUPERLU_DCOMPLEX - -/* - * This header file is to be included in source files z*.c - */ -#ifndef DCOMPLEX_INCLUDE -#define DCOMPLEX_INCLUDE - -typedef struct { double r, i; } doublecomplex; - - -/* Macro definitions */ - -/* Complex Addition c = a + b */ -#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \ - (c)->i = (a)->i + (b)->i; } - -/* Complex Subtraction c = a - b */ -#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \ - (c)->i = (a)->i - (b)->i; } - -/* Complex-Double Multiplication */ -#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \ - (c)->i = (a)->i * (b); } - -/* Complex-Complex Multiplication */ -#define zz_mult(c, a, b) { \ - double cr, ci; \ - cr = (a)->r * (b)->r - (a)->i * (b)->i; \ - ci = (a)->i * (b)->r + (a)->r * (b)->i; \ - (c)->r = cr; \ - (c)->i = ci; \ - } - -#define zz_conj(a, b) { \ - (a)->r = (b)->r; \ - (a)->i = -((b)->i); \ - } - -/* Complex equality testing */ -#define z_eq(a, b) ( (a)->r == (b)->r && (a)->i == (b)->i ) - - -#ifdef __cplusplus -extern "C" { -#endif - -/* Prototypes for functions in dcomplex.c */ -void z_div(doublecomplex *, doublecomplex *, doublecomplex *); -double z_abs(doublecomplex *); /* exact */ -double z_abs1(doublecomplex *); /* approximate */ -void z_exp(doublecomplex *, doublecomplex *); -void d_cnjg(doublecomplex *r, doublecomplex *z); -double d_imag(doublecomplex *); - - -#ifdef __cplusplus - } -#endif - -#endif - -#endif /* __SUPERLU_DCOMPLEX */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_ddefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_ddefs.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_ddefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_ddefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,240 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#ifndef __SUPERLU_dSP_DEFS /* allow multiple inclusions */ -#define __SUPERLU_dSP_DEFS - -/* - * File name: dsp_defs.h - * Purpose: Sparse matrix types and function prototypes - * History: - */ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include -#ifdef _CRAY -#include -#endif - -/* Define my integer type int_t */ -typedef int int_t; /* default */ - -/* Added by Rob Falgout temporarily */ -//typedef int logical; - -#include "supermatrix.h" -#include "slu_util.h" - -/* - * Global data structures used in LU factorization - - * - * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper]. - * (xsup,supno): supno[i] is the supernode no to which i belongs; - * xsup(s) points to the beginning of the s-th supernode. - * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) - * xsup 0 1 2 4 7 12 - * Note: dfs will be performed on supernode rep. relative to the new - * row pivoting ordering - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: original row subscripts - * - * During the course of sparse LU factorization, we also use - * (xlsub,lsub) for the purpose of symmetric pruning. For each - * supernode {s,s+1,...,t=s+r} with first column s and last - * column t, the subscript set - * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * Furthermore, - * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * However, if the supernode has only one column, then we - * only keep one set of subscripts. For any subscript interchange - * performed, similar interchange must be done on the numerical - * values. - * - * The last column structures (for pruning) will be removed - * after the numercial LU factorization phase. - * - * (xlusup,lusup): lusup[*] contains the numerical values of the - * rectangular supernodes; xlusup[j] points to the starting - * location of the j-th column in storage vector lusup[*] - * Note: xlusup is indexed by column. - * Each rectangular supernode is stored by column-major - * scheme, consistent with Fortran 2-dim array storage. - * - * (xusub,ucol,usub): ucol[*] stores the numerical values of - * U-columns outside the rectangular supernodes. The row - * subscript of nonzero ucol[k] is stored in usub[k]. - * xusub[i] points to the starting location of column i in ucol. - * Storage: new row subscripts; that is subscripts of PA. - */ -typedef struct { - int *xsup; /* supernode and column mapping */ - int *supno; - int *lsub; /* compressed L subscripts */ - int *xlsub; - double *lusup; /* L supernodes */ - int *xlusup; - double *ucol; /* U columns */ - int *usub; - int *xusub; - int nzlmax; /* current max size of lsub */ - int nzumax; /* " " " ucol */ - int nzlumax; /* " " " lusup */ - int n; /* number of columns in the matrix */ - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ -} GlobalLU_t; - -typedef struct { - float for_lu; - float total_needed; - int expansions; -} mem_usage_t; - -#ifdef __cplusplus -extern "C" { -#endif - -/* Driver routines */ -extern void -dgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); -extern void -dgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, - char *, double *, double *, SuperMatrix *, SuperMatrix *, - void *, int, SuperMatrix *, SuperMatrix *, - double *, double *, double *, double *, - mem_usage_t *, SuperLUStat_t *, int *); - -/* Supernodal LU factor related */ -extern void -dCreate_CompCol_Matrix(SuperMatrix *, int, int, int, double *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -dCreate_CompRow_Matrix(SuperMatrix *, int, int, int, double *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -dCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); -extern void -dCreate_Dense_Matrix(SuperMatrix *, int, int, double *, int, - Stype_t, Dtype_t, Mtype_t); -extern void -dCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, double *, - int *, int *, int *, int *, int *, - Stype_t, Dtype_t, Mtype_t); -extern void -dCopy_Dense_Matrix(int, int, double *, int, double *, int); - -extern void countnz (const int, int *, int *, int *, GlobalLU_t *); -extern void fixupL (const int, const int *, GlobalLU_t *); - -extern void dallocateA (int, int, double **, int **, int **); -extern void dgstrf (superlu_options_t*, SuperMatrix*, double, - int, int, int*, void *, int, int *, int *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); -extern int dsnode_dfs (const int, const int, const int *, const int *, - const int *, int *, int *, GlobalLU_t *); -extern int dsnode_bmod (const int, const int, const int, double *, - double *, GlobalLU_t *, SuperLUStat_t*); -extern void dpanel_dfs (const int, const int, const int, SuperMatrix *, - int *, int *, double *, int *, int *, int *, - int *, int *, int *, int *, GlobalLU_t *); -extern void dpanel_bmod (const int, const int, const int, const int, - double *, double *, int *, int *, - GlobalLU_t *, SuperLUStat_t*); -extern int dcolumn_dfs (const int, const int, int *, int *, int *, int *, - int *, int *, int *, int *, int *, GlobalLU_t *); -extern int dcolumn_bmod (const int, const int, double *, - double *, int *, int *, int, - GlobalLU_t *, SuperLUStat_t*); -extern int dcopy_to_ucol (int, int, int *, int *, int *, - double *, GlobalLU_t *); -extern int dpivotL (const int, const double, int *, int *, - int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); -extern void dpruneL (const int, const int *, const int, const int, - const int *, const int *, int *, GlobalLU_t *); -extern void dreadmt (int *, int *, int *, double **, int **, int **); -extern void dGenXtrue (int, int, double *, int); -extern void dFillRHS (trans_t, int, double *, int, SuperMatrix *, - SuperMatrix *); -extern void dgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, - SuperMatrix *, SuperLUStat_t*, int *); - - -/* Driver related */ - -extern void dgsequ (SuperMatrix *, double *, double *, double *, - double *, double *, int *); -extern void dlaqgs (SuperMatrix *, double *, double *, double, - double, double, char *); -extern void dgscon (char *, SuperMatrix *, SuperMatrix *, - double, double *, SuperLUStat_t*, int *); -extern double dPivotGrowth(int, SuperMatrix *, int *, - SuperMatrix *, SuperMatrix *); -extern void dgsrfs (trans_t, SuperMatrix *, SuperMatrix *, - SuperMatrix *, int *, int *, char *, double *, - double *, SuperMatrix *, SuperMatrix *, - double *, double *, SuperLUStat_t*, int *); - -extern int sp_dtrsv (const char *,const char *,const char *, SuperMatrix *, - SuperMatrix *, double *, SuperLUStat_t*, int *); -extern int sp_dgemv (const char *, double, SuperMatrix *, double *, - int, double, double *, int); - -extern int sp_dgemm (const char *,const char *, int, int, int, double, - SuperMatrix *, double *, int, double, - double *, int); - -/* Memory-related */ -extern int dLUMemInit (fact_t, void *, int, int, int, int, int, - SuperMatrix *, SuperMatrix *, - GlobalLU_t *, int **, double **); -extern void dSetRWork (int, int, double *, double **, double **); -extern void dLUWorkFree (int *, double *, GlobalLU_t *); -extern int dLUMemXpand (int, int, MemType, int *, GlobalLU_t *); - -extern double *doubleMalloc(int); -extern double *doubleCalloc(int); -extern int dmemory_usage(const int, const int, const int, const int); -extern int dQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); - -/* Auxiliary routines */ -extern void dreadhb(int *, int *, int *, double **, int **, int **); -extern void dCompRow_to_CompCol(int, int, int, double*, int*, int*, - double **, int **, int **); -extern void dfill (double *, int, double); -extern void dinf_norm_error (int, SuperMatrix *, double *); -extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, - double, double, double *, double *, char *); - -/* Routines for debugging */ -extern void dPrint_CompCol_Matrix(char *, SuperMatrix *); -extern void dPrint_SuperNode_Matrix(char *, SuperMatrix *); -extern void dPrint_Dense_Matrix(char *, SuperMatrix *); -extern void print_lu_col(char *, int, int, int *, GlobalLU_t *); -extern void check_tempv(int, double *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_dSP_DEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_scomplex.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_scomplex.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_scomplex.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_scomplex.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#ifndef __SUPERLU_SCOMPLEX /* allow multiple inclusions */ -#define __SUPERLU_SCOMPLEX - -/* - * This header file is to be included in source files c*.c - */ -#ifndef SCOMPLEX_INCLUDE -#define SCOMPLEX_INCLUDE - -typedef struct { float r, i; } complex; - - -/* Macro definitions */ - -/* Complex Addition c = a + b */ -#define c_add(c, a, b) { (c)->r = (a)->r + (b)->r; \ - (c)->i = (a)->i + (b)->i; } - -/* Complex Subtraction c = a - b */ -#define c_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \ - (c)->i = (a)->i - (b)->i; } - -/* Complex-Double Multiplication */ -#define cs_mult(c, a, b) { (c)->r = (a)->r * (b); \ - (c)->i = (a)->i * (b); } - -/* Complex-Complex Multiplication */ -#define cc_mult(c, a, b) { \ - float cr, ci; \ - cr = (a)->r * (b)->r - (a)->i * (b)->i; \ - ci = (a)->i * (b)->r + (a)->r * (b)->i; \ - (c)->r = cr; \ - (c)->i = ci; \ - } - -#define cc_conj(a, b) { \ - (a)->r = (b)->r; \ - (a)->i = -((b)->i); \ - } - -/* Complex equality testing */ -#define c_eq(a, b) ( (a)->r == (b)->r && (a)->i == (b)->i ) - - -#ifdef __cplusplus -extern "C" { -#endif - -/* Prototypes for functions in scomplex.c */ -void c_div(complex *, complex *, complex *); -double c_abs(complex *); /* exact */ -double c_abs1(complex *); /* approximate */ -void c_exp(complex *, complex *); -void r_cnjg(complex *, complex *); -double r_imag(complex *); - - -#ifdef __cplusplus - } -#endif - -#endif - -#endif /* __SUPERLU_SCOMPLEX */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_sdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_sdefs.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_sdefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_sdefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,234 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#ifndef __SUPERLU_sSP_DEFS /* allow multiple inclusions */ -#define __SUPERLU_sSP_DEFS - -/* - * File name: ssp_defs.h - * Purpose: Sparse matrix types and function prototypes - * History: - */ - -#ifdef _CRAY -#include -#include -#endif - -/* Define my integer type int_t */ -typedef int int_t; /* default */ - -#include "slu_Cnames.h" -#include "supermatrix.h" -#include "slu_util.h" - - -/* - * Global data structures used in LU factorization - - * - * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper]. - * (xsup,supno): supno[i] is the supernode no to which i belongs; - * xsup(s) points to the beginning of the s-th supernode. - * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) - * xsup 0 1 2 4 7 12 - * Note: dfs will be performed on supernode rep. relative to the new - * row pivoting ordering - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: original row subscripts - * - * During the course of sparse LU factorization, we also use - * (xlsub,lsub) for the purpose of symmetric pruning. For each - * supernode {s,s+1,...,t=s+r} with first column s and last - * column t, the subscript set - * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * Furthermore, - * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * However, if the supernode has only one column, then we - * only keep one set of subscripts. For any subscript interchange - * performed, similar interchange must be done on the numerical - * values. - * - * The last column structures (for pruning) will be removed - * after the numercial LU factorization phase. - * - * (xlusup,lusup): lusup[*] contains the numerical values of the - * rectangular supernodes; xlusup[j] points to the starting - * location of the j-th column in storage vector lusup[*] - * Note: xlusup is indexed by column. - * Each rectangular supernode is stored by column-major - * scheme, consistent with Fortran 2-dim array storage. - * - * (xusub,ucol,usub): ucol[*] stores the numerical values of - * U-columns outside the rectangular supernodes. The row - * subscript of nonzero ucol[k] is stored in usub[k]. - * xusub[i] points to the starting location of column i in ucol. - * Storage: new row subscripts; that is subscripts of PA. - */ -typedef struct { - int *xsup; /* supernode and column mapping */ - int *supno; - int *lsub; /* compressed L subscripts */ - int *xlsub; - float *lusup; /* L supernodes */ - int *xlusup; - float *ucol; /* U columns */ - int *usub; - int *xusub; - int nzlmax; /* current max size of lsub */ - int nzumax; /* " " " ucol */ - int nzlumax; /* " " " lusup */ - int n; /* number of columns in the matrix */ - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ -} GlobalLU_t; - -typedef struct { - float for_lu; - float total_needed; - int expansions; -} mem_usage_t; - -#ifdef __cplusplus -extern "C" { -#endif - -/* Driver routines */ -extern void -sgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); -extern void -sgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, - char *, float *, float *, SuperMatrix *, SuperMatrix *, - void *, int, SuperMatrix *, SuperMatrix *, - float *, float *, float *, float *, - mem_usage_t *, SuperLUStat_t *, int *); - -/* Supernodal LU factor related */ -extern void -sCreate_CompCol_Matrix(SuperMatrix *, int, int, int, float *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -sCreate_CompRow_Matrix(SuperMatrix *, int, int, int, float *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -sCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); -extern void -sCreate_Dense_Matrix(SuperMatrix *, int, int, float *, int, - Stype_t, Dtype_t, Mtype_t); -extern void -sCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, float *, - int *, int *, int *, int *, int *, - Stype_t, Dtype_t, Mtype_t); -extern void -sCopy_Dense_Matrix(int, int, float *, int, float *, int); - -extern void countnz (const int, int *, int *, int *, GlobalLU_t *); -extern void fixupL (const int, const int *, GlobalLU_t *); - -extern void sallocateA (int, int, float **, int **, int **); -extern void sgstrf (superlu_options_t*, SuperMatrix*, float, - int, int, int*, void *, int, int *, int *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); -extern int ssnode_dfs (const int, const int, const int *, const int *, - const int *, int *, int *, GlobalLU_t *); -extern int ssnode_bmod (const int, const int, const int, float *, - float *, GlobalLU_t *, SuperLUStat_t*); -extern void spanel_dfs (const int, const int, const int, SuperMatrix *, - int *, int *, float *, int *, int *, int *, - int *, int *, int *, int *, GlobalLU_t *); -extern void spanel_bmod (const int, const int, const int, const int, - float *, float *, int *, int *, - GlobalLU_t *, SuperLUStat_t*); -extern int scolumn_dfs (const int, const int, int *, int *, int *, int *, - int *, int *, int *, int *, int *, GlobalLU_t *); -extern int scolumn_bmod (const int, const int, float *, - float *, int *, int *, int, - GlobalLU_t *, SuperLUStat_t*); -extern int scopy_to_ucol (int, int, int *, int *, int *, - float *, GlobalLU_t *); -extern int spivotL (const int, const float, int *, int *, - int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); -extern void spruneL (const int, const int *, const int, const int, - const int *, const int *, int *, GlobalLU_t *); -extern void sreadmt (int *, int *, int *, float **, int **, int **); -extern void sGenXtrue (int, int, float *, int); -extern void sFillRHS (trans_t, int, float *, int, SuperMatrix *, - SuperMatrix *); -extern void sgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, - SuperMatrix *, SuperLUStat_t*, int *); - - -/* Driver related */ - -extern void sgsequ (SuperMatrix *, float *, float *, float *, - float *, float *, int *); -extern void slaqgs (SuperMatrix *, float *, float *, float, - float, float, char *); -extern void sgscon (char *, SuperMatrix *, SuperMatrix *, - float, float *, SuperLUStat_t*, int *); -extern float sPivotGrowth(int, SuperMatrix *, int *, - SuperMatrix *, SuperMatrix *); -extern void sgsrfs (trans_t, SuperMatrix *, SuperMatrix *, - SuperMatrix *, int *, int *, char *, float *, - float *, SuperMatrix *, SuperMatrix *, - float *, float *, SuperLUStat_t*, int *); - -extern int sp_strsv (const char *,const char *,const char *, SuperMatrix *, - SuperMatrix *, float *, SuperLUStat_t*, int *); -extern int sp_sgemv (char *, float, SuperMatrix *, float *, - int, float, float *, int); - -extern int sp_sgemm (char *, char *, int, int, int, float, - SuperMatrix *, float *, int, float, - float *, int); - -/* Memory-related */ -extern int sLUMemInit (fact_t, void *, int, int, int, int, int, - SuperMatrix *, SuperMatrix *, - GlobalLU_t *, int **, float **); -extern void sSetRWork (int, int, float *, float **, float **); -extern void sLUWorkFree (int *, float *, GlobalLU_t *); -extern int sLUMemXpand (int, int, MemType, int *, GlobalLU_t *); - -extern float *floatMalloc(int); -extern float *floatCalloc(int); -extern int smemory_usage(const int, const int, const int, const int); -extern int sQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); - -/* Auxiliary routines */ -extern void sreadhb(int *, int *, int *, float **, int **, int **); -extern void sCompRow_to_CompCol(int, int, int, float*, int*, int*, - float **, int **, int **); -extern void sfill (float *, int, float); -extern void sinf_norm_error (int, SuperMatrix *, float *); -extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, - float, float, float *, float *, char *); - -/* Routines for debugging */ -extern void sPrint_CompCol_Matrix(char *, SuperMatrix *); -extern void sPrint_SuperNode_Matrix(char *, SuperMatrix *); -extern void sPrint_Dense_Matrix(char *, SuperMatrix *); -extern void print_lu_col(char *, int, int, int *, GlobalLU_t *); -extern void check_tempv(int, float *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_sSP_DEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,736 +0,0 @@ -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - -#include -#include "slu_ddefs.h" - -/* - * Global statistics variale - */ - -void superlu_abort_and_exit(char* msg) -{ - fprintf(stderr, "%s", msg); - exit (-1); -} - -/* - * Set the default values for the options argument. - */ -void set_default_options(superlu_options_t *options) -{ - options->Fact = DOFACT; - options->Equil = YES; - options->ColPerm = COLAMD; - options->DiagPivotThresh = 1.0; - options->Trans = NOTRANS; - options->IterRefine = NOREFINE; - options->SymmetricMode = NO; - options->PivotGrowth = NO; - options->ConditionNumber = NO; - options->PrintStat = YES; -} - -/* - * Print the options setting. - */ -void print_options(superlu_options_t *options) -{ - printf(".. options:\n"); - printf("\tFact\t %8d\n", options->Fact); - printf("\tEquil\t %8d\n", options->Equil); - printf("\tColPerm\t %8d\n", options->ColPerm); - printf("\tDiagPivotThresh %8.4f\n", options->DiagPivotThresh); - printf("\tTrans\t %8d\n", options->Trans); - printf("\tIterRefine\t%4d\n", options->IterRefine); - printf("\tSymmetricMode\t%4d\n", options->SymmetricMode); - printf("\tPivotGrowth\t%4d\n", options->PivotGrowth); - printf("\tConditionNumber\t%4d\n", options->ConditionNumber); - printf("..\n"); -} - -/* Deallocate the structure pointing to the actual storage of the matrix. */ -void -Destroy_SuperMatrix_Store(SuperMatrix *A) -{ - SUPERLU_FREE ( A->Store ); -} - -void -Destroy_CompCol_Matrix(SuperMatrix *A) -{ - SUPERLU_FREE( ((NCformat *)A->Store)->rowind ); - SUPERLU_FREE( ((NCformat *)A->Store)->colptr ); - SUPERLU_FREE( ((NCformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); -} - -void -Destroy_CompRow_Matrix(SuperMatrix *A) -{ - SUPERLU_FREE( ((NRformat *)A->Store)->colind ); - SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); - SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); -} - -void -Destroy_SuperNode_Matrix(SuperMatrix *A) -{ - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); - SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); - SUPERLU_FREE ( A->Store ); -} - -/* A is of type Stype==NCP */ -void -Destroy_CompCol_Permuted(SuperMatrix *A) -{ - SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); - SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); - SUPERLU_FREE ( A->Store ); -} - -/* A is of type Stype==DN */ -void -Destroy_Dense_Matrix(SuperMatrix *A) -{ - DNformat* Astore = (DNformat*) A->Store; - SUPERLU_FREE (Astore->nzval); - SUPERLU_FREE ( A->Store ); -} - -/* - * Reset repfnz[] for the current column - */ -void -resetrep_col (const int nseg, const int *segrep, int *repfnz) -{ - int i, irep; - - for (i = 0; i < nseg; i++) { - irep = segrep[i]; - repfnz[irep] = EMPTY; - } -} - - -/* - * Count the total number of nonzeros in factors L and U, and in the - * symmetrically reduced L. - */ -void -countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu) -{ - int nsuper, fsupc, i, j; - int nnzL0, jlen, irep; - int *xsup, *xlsub; - - xsup = Glu->xsup; - xlsub = Glu->xlsub; - *nnzL = 0; - *nnzU = (Glu->xusub)[n]; - nnzL0 = 0; - nsuper = (Glu->supno)[n]; - - if ( n <= 0 ) return; - - /* - * For each supernode - */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jlen = xlsub[fsupc+1] - xlsub[fsupc]; - - for (j = fsupc; j < xsup[i+1]; j++) { - *nnzL += jlen; - *nnzU += j - fsupc + 1; - jlen--; - } - irep = xsup[i+1] - 1; - nnzL0 += xprune[irep] - xlsub[irep]; - } - - /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ -} - - - -/* - * Fix up the data storage lsub for L-subscripts. It removes the subscript - * sets for structural pruning, and applies permuation to the remaining - * subscripts. - */ -void -fixupL(const int n, const int *perm_r, GlobalLU_t *Glu) -{ - register int nsuper, fsupc, nextl, i, j, k, jstrt; - int *xsup, *lsub, *xlsub; - - if ( n <= 1 ) return; - - xsup = Glu->xsup; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nextl = 0; - nsuper = (Glu->supno)[n]; - - /* - * For each supernode ... - */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jstrt = xlsub[fsupc]; - xlsub[fsupc] = nextl; - for (j = jstrt; j < xlsub[fsupc+1]; j++) { - lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ - nextl++; - } - for (k = fsupc+1; k < xsup[i+1]; k++) - xlsub[k] = nextl; /* Other columns in supernode i */ - - } - - xlsub[n] = nextl; -} - - -/* - * Diagnostic print of segment info after panel_dfs(). - */ -void print_panel_seg(int n, int w, int jcol, int nseg, - int *segrep, int *repfnz) -{ - int j, k; - - for (j = jcol; j < jcol+w; j++) { - printf("\tcol %d:\n", j); - for (k = 0; k < nseg; k++) - printf("\t\tseg %d, segrep %d, repfnz %d\n", k, - segrep[k], repfnz[(j-jcol)*n + segrep[k]]); - } - -} - - -void -StatInit(SuperLUStat_t *stat) -{ - register int i, w, panel_size, relax; - - panel_size = sp_ienv(1); - relax = sp_ienv(2); - w = SUPERLU_MAX(panel_size, relax); - stat->panel_histo = intCalloc(w+1); - stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double)); - if (!stat->utime) ABORT("SUPERLU_MALLOC fails for stat->utime"); - stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); - if (!stat->ops) ABORT("SUPERLU_MALLOC fails for stat->ops"); - for (i = 0; i < NPHASES; ++i) { - stat->utime[i] = 0.; - stat->ops[i] = 0.; - } -} - - -void -StatPrint(SuperLUStat_t *stat) -{ - double *utime; - flops_t *ops; - - utime = stat->utime; - ops = stat->ops; - printf("Factor time = %8.2f\n", utime[FACT]); - if ( utime[FACT] != 0.0 ) - printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], - ops[FACT]*1e-6/utime[FACT]); - - printf("Solve time = %8.2f\n", utime[SOLVE]); - if ( utime[SOLVE] != 0.0 ) - printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE], - ops[SOLVE]*1e-6/utime[SOLVE]); - -} - - -void -StatFree(SuperLUStat_t *stat) -{ - SUPERLU_FREE(stat->panel_histo); - SUPERLU_FREE(stat->utime); - SUPERLU_FREE(stat->ops); -} - - -flops_t -LUFactFlops(SuperLUStat_t *stat) -{ - return (stat->ops[FACT]); -} - -flops_t -LUSolveFlops(SuperLUStat_t *stat) -{ - return (stat->ops[SOLVE]); -} - - - - - -/* - * Fills an integer array with a given value. - */ -void ifill(int *a, int alen, int ival) -{ - register int i; - for (i = 0; i < alen; i++) a[i] = ival; -} - - - -/* - * Get the statistics of the supernodes - */ -#define NBUCKS 10 -static int max_sup_size; - -void super_stats(int nsuper, int *xsup) -{ - register int nsup1 = 0; - int i, isize, whichb, bl, bh; - int bucket[NBUCKS]; - - max_sup_size = 0; - - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - if ( isize == 1 ) nsup1++; - if ( max_sup_size < isize ) max_sup_size = isize; - } - - printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); - printf("\tmax supernode size = %d\n", max_sup_size); - printf("\tno of size 1 supernodes = %d\n", nsup1); - - /* Histogram of the supernode sizes */ - ifill (bucket, NBUCKS, 0); - - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - whichb = (float) isize / max_sup_size * NBUCKS; - if (whichb >= NBUCKS) whichb = NBUCKS - 1; - bucket[whichb]++; - } - - printf("\tHistogram of supernode sizes:\n"); - for (i = 0; i < NBUCKS; i++) { - bl = (float) i * max_sup_size / NBUCKS; - bh = (float) (i+1) * max_sup_size / NBUCKS; - printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); - } - -} - - -float SpaSize(int n, int np, float sum_npw) -{ - return (sum_npw*8 + np*8 + n*4)/1024.; -} - -float DenseSize(int n, float sum_nw) -{ - return (sum_nw*8 + n*8)/1024.;; -} - - - -/* - * Check whether repfnz[] == EMPTY after reset. - */ -void check_repfnz(int n, int w, int jcol, int *repfnz) -{ - int jj, k; - - for (jj = jcol; jj < jcol+w; jj++) - for (k = 0; k < n; k++) - if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { - fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, - k, repfnz[(jj-jcol)*n + k]); - ABORT("check_repfnz"); - } -} - - -/* Print a summary of the testing results. */ -void -PrintSumm(char *type, int nfail, int nrun, int nerrs) -{ - if ( nfail > 0 ) - printf("%3s driver: %d out of %d tests failed to pass the threshold\n", - type, nfail, nrun); - else - printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun); - - if ( nerrs > 0 ) - printf("%6d error messages recorded\n", nerrs); -} - - -int print_int_vec(char *what, int n, int *vec) -{ - int i; - printf("%s\n", what); - for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]); - return 0; -} - -int superlu_lsame(const char *ca,const char *cb) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - Purpose - ======= - - LSAME returns .TRUE. if CA is the same letter as CB regardless of case. - - Arguments - ========= - - CA (input) CHARACTER*1 - CB (input) CHARACTER*1 - CA and CB specify the single characters to be compared. - - ===================================================================== -*/ - - /* System generated locals */ - int ret_val; - - /* Local variables */ - int inta, intb, zcode; - - ret_val = *(unsigned char *)ca == *(unsigned char *)cb; - if (ret_val) { - return ret_val; - } - - /* Now test for equivalence if both characters are alphabetic. */ - - zcode = 'Z'; - - /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime - machines, on which ICHAR returns a value with bit 8 set. - ICHAR('A') on Prime machines returns 193 which is the same as - ICHAR('A') on an EBCDIC machine. */ - - inta = *(unsigned char *)ca; - intb = *(unsigned char *)cb; - - if (zcode == 90 || zcode == 122) { - /* ASCII is assumed - ZCODE is the ASCII code of either lower or - upper case 'Z'. */ - if (inta >= 97 && inta <= 122) inta += -32; - if (intb >= 97 && intb <= 122) intb += -32; - - } else if (zcode == 233 || zcode == 169) { - /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or - upper case 'Z'. */ - if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || - (inta >= 162 && inta <= 169)) - inta += 64; - if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || - (intb >= 162 && intb <= 169)) - intb += 64; - } else if (zcode == 218 || zcode == 250) { - /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code - plus 128 of either lower or upper case 'Z'. */ - if (inta >= 225 && inta <= 250) inta += -32; - if (intb >= 225 && intb <= 250) intb += -32; - } - ret_val = inta == intb; - return ret_val; - -} /* superlu_lsame */ - -/* Subroutine */ int superlu_xerbla(const char *srname, int *info) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - XERBLA is an error handler for the LAPACK routines. - It is called by an LAPACK routine if an input parameter has an - invalid value. A message is printed and execution stops. - - Installers may consider modifying the STOP statement in order to - call system-specific exception-handling facilities. - - Arguments - ========= - - SRNAME (input) CHARACTER*6 - The name of the routine which called XERBLA. - - INFO (input) INT - The position of the invalid parameter in the parameter list - - of the calling routine. - - ===================================================================== -*/ - - printf("** On entry to %6s, parameter number %2d had an illegal value\n", - srname, *info); - -/* End of XERBLA */ - - return 0; -} /* superlu_xerbla */ - - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: dmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void sludlsolve ( int ldm, int ncol, double *M, double *rhs ) -{ - int k; - double x0, x1, x2, x3, x4, x5, x6, x7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - - M0 = &M[0]; - - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - Mki4 = Mki3 + ldm + 1; - Mki5 = Mki4 + ldm + 1; - Mki6 = Mki5 + ldm + 1; - Mki7 = Mki6 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++; - x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++; - x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; - x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - rhs[++firstcol] = x4; - rhs[++firstcol] = x5; - rhs[++firstcol] = x6; - rhs[++firstcol] = x7; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++ - - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++ - x7 * *Mki7++; - - M0 += 8 * ldm + 8; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++; - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; - - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -sludusolve (int ldm,int ncol,double *M,double *rhs ) -{ - double xj; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) - rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void sludmatvec (int ldm,int nrow,int ncol,double *M,double *vec,double *Mxvec ) -{ - double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; - double *M0; - register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - int k; - - M0 = &M[0]; - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - Mki4 = Mki3 + ldm; - Mki5 = Mki4 + ldm; - Mki6 = Mki5 + ldm; - Mki7 = Mki6 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - vi4 = vec[firstcol++]; - vi5 = vec[firstcol++]; - vi6 = vec[firstcol++]; - vi7 = vec[firstcol++]; - - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ - + vi4 * *Mki4++ + vi5 * *Mki5++ - + vi6 * *Mki6++ + vi7 * *Mki7++; - - M0 += 8 * ldm; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ ; - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++; - - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,271 +0,0 @@ -/* - This file has been modified to be compatible with the HYPRE - linear solver -*/ - - -#ifndef __SUPERLU_UTIL /* allow multiple inclusions */ -#define __SUPERLU_UTIL - -#include -#include -#include -/* -#ifndef __STDC__ -#include -#endif -*/ -#include - -/*********************************************************************** - * Macros - ***********************************************************************/ -#define FIRSTCOL_OF_SNODE(i) (xsup[i]) -/* No of marker arrays used in the symbolic factorization, - each of size n */ -#define NO_MARKER 3 -#define NUM_TEMPV(m,w,t,b) ( SUPERLU_MAX(m, (t + b)*w) ) - -#ifndef USER_ABORT -#define USER_ABORT(msg) superlu_abort_and_exit(msg) -#endif - -#define ABORT(err_msg) \ - { char msg[256];\ - sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\ - USER_ABORT(msg); } - - -#ifndef USER_MALLOC -#if 1 -#define USER_MALLOC(size) superlu_malloc(size) -#else -/* The following may check out some uninitialized data */ -#define USER_MALLOC(size) memset (superlu_malloc(size), '\x0F', size) -#endif -#endif - -#define SUPERLU_MALLOC(size) USER_MALLOC(size) - -#ifndef USER_FREE -#define USER_FREE(addr) superlu_free(addr) -#endif - -#define SUPERLU_FREE(addr) USER_FREE(addr) - -#define CHECK_MALLOC(where) { \ - extern int superlu_malloc_total; \ - printf("%s: malloc_total %d Bytes\n", \ - where, superlu_malloc_total); \ -} - -#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) -#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) ) - -/*********************************************************************** - * Constants - ***********************************************************************/ -#define EMPTY (-1) -/*#define NO (-1)*/ -#define FALSE 0 -#define TRUE 1 - -/*********************************************************************** - * Enumerate types - ***********************************************************************/ -typedef enum {NO, YES} yes_no_t; -typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; -typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; -typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, MY_PERMC}colperm_t; -typedef enum {NOTRANS, TRANS, CONJ} trans_t; -typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; -typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA} IterRefine_t; -typedef enum {LUSUP, UCOL, LSUB, USUB} MemType; -typedef enum {HEAD, TAIL} stack_end_t; -typedef enum {SYSTEM, USER} LU_space_t; - -/* - * The following enumerate type is used by the statistics variable - * to keep track of flop count and time spent at various stages. - * - * Note that not all of the fields are disjoint. - */ -typedef enum { - COLPERM, /* find a column ordering that minimizes fills */ - RELAX, /* find artificial supernodes */ - ETREE, /* compute column etree */ - EQUIL, /* equilibrate the original matrix */ - FACT, /* perform LU factorization */ - RCOND, /* estimate reciprocal condition number */ - SOLVE, /* forward and back solves */ - REFINE, /* perform iterative refinement */ - TRSV, /* fraction of FACT spent in xTRSV */ - GEMV, /* fraction of FACT spent in xGEMV */ - FERR, /* estimate error bounds after iterative refinement */ - NPHASES /* total number of phases */ -} PhaseType; - - -/*********************************************************************** - * Type definitions - ***********************************************************************/ -typedef float flops_t; -typedef unsigned char Logical; - -/* - *-- This contains the options used to control the solve process. - * - * Fact (fact_t) - * Specifies whether or not the factored form of the matrix - * A is supplied on entry, and if not, how the matrix A should - * be factorizaed. - * = DOFACT: The matrix A will be factorized from scratch, and the - * factors will be stored in L and U. - * = SamePattern: The matrix A will be factorized assuming - * that a factorization of a matrix with the same sparsity - * pattern was performed prior to this one. Therefore, this - * factorization will reuse column permutation vector - * ScalePermstruct->perm_c and the column elimination tree - * LUstruct->etree. - * = SamePattern_SameRowPerm: The matrix A will be factorized - * assuming that a factorization of a matrix with the same - * sparsity pattern and similar numerical values was performed - * prior to this one. Therefore, this factorization will reuse - * both row and column scaling factors R and C, both row and - * column permutation vectors perm_r and perm_c, and the - * data structure set up from the previous symbolic factorization. - * = FACTORED: On entry, L, U, perm_r and perm_c contain the - * factored form of A. If DiagScale is not NOEQUIL, the matrix - * A has been equilibrated with scaling factors R and C. - * - * Equil (yes_no_t) - * Specifies whether to equilibrate the system (scale A's row and - * columns to have unit norm). - * - * ColPerm (colperm_t) - * Specifies what type of column permutation to use to reduce fill. - * = NATURAL: use the natural ordering - * = MMD_ATA: use minimum degree ordering on structure of A'*A - * = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A - * = COLAMD: use approximate minimum degree column ordering - * = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[] - * - * Trans (trans_t) - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A**T * X = B (Transpose) - * = CONJ: A**H * X = B (Transpose) - * - * IterRefine (IterRefine_t) - * Specifies whether to perform iterative refinement. - * = NO: no iterative refinement - * = WorkingPrec: perform iterative refinement in working precision - * = ExtraPrec: perform iterative refinement in extra precision - * - * PrintStat (yes_no_t) - * Specifies whether to print the solver's statistics. - * - * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU) - * Specifies the threshold used for a diagonal entry to be an - * acceptable pivot. - * - * PivotGrowth (yes_no_t) - * Specifies whether to compute the reciprocal pivot growth. - * - * ConditionNumber (ues_no_t) - * Specifies whether to compute the reciprocal condition number. - * - * RowPerm (rowperm_t) (only for SuperLU_DIST) - * Specifies whether to permute rows of the original matrix. - * = NO: not to permute the rows - * = LargeDiag: make the diagonal large relative to the off-diagonal - * = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[] - * - * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST) - * Specifies whether to replace the tiny diagonals by - * sqrt(epsilon)*||A|| during LU factorization. - * - * SolveInitialized (yes_no_t) (only for SuperLU_DIST) - * Specifies whether the initialization has been performed to the - * triangular solve. - * - * RefineInitialized (yes_no_t) (only for SuperLU_DIST) - * Specifies whether the initialization has been performed to the - * sparse matrix-vector multiplication routine needed in iterative - * refinement. - */ -typedef struct { - fact_t Fact; - yes_no_t Equil; - colperm_t ColPerm; - trans_t Trans; - IterRefine_t IterRefine; - yes_no_t PrintStat; - yes_no_t SymmetricMode; - double DiagPivotThresh; - yes_no_t PivotGrowth; - yes_no_t ConditionNumber; - rowperm_t RowPerm; - yes_no_t ReplaceTinyPivot; - yes_no_t SolveInitialized; - yes_no_t RefineInitialized; -} superlu_options_t; - -typedef struct { - int *panel_histo; /* histogram of panel size distribution */ - double *utime; /* running time at various phases */ - flops_t *ops; /* operation count at various phases */ - int TinyPivots; /* number of tiny pivots */ - int RefineSteps; /* number of iterative refinement steps */ -} SuperLUStat_t; - - -/*********************************************************************** - * Prototypes - ***********************************************************************/ -#ifdef __cplusplus -extern "C" { -#endif - -extern void Destroy_SuperMatrix_Store(SuperMatrix *); -extern void Destroy_CompCol_Matrix(SuperMatrix *); -extern void Destroy_CompRow_Matrix(SuperMatrix *); -extern void Destroy_SuperNode_Matrix(SuperMatrix *); -extern void Destroy_CompCol_Permuted(SuperMatrix *); -extern void Destroy_Dense_Matrix(SuperMatrix *); -extern void get_perm_c(int, SuperMatrix *, int *); -extern void set_default_options(superlu_options_t *options); -extern void sp_preorder (superlu_options_t *, SuperMatrix*, int*, int*, - SuperMatrix*); -extern void superlu_abort_and_exit(char*); -extern void *superlu_malloc (size_t); -extern int *intMalloc (int); -extern int *intCalloc (int); -extern void superlu_free (void*); -extern void SetIWork (int, int, int, int *, int **, int **, int **, - int **, int **, int **, int **); -extern int sp_coletree (int *, int *, int *, int, int, int *); -extern void relax_snode (const int, int *, const int, int *, int *); -extern void heap_relax_snode (const int, int *, const int, int *, int *); -extern void resetrep_col (const int, const int *, int *); -extern int spcoletree (int *, int *, int *, int, int, int *); -extern int *TreePostorder (int, int *); -extern double SuperLU_timer_ (); -extern int sp_ienv (int); -extern int superlu_lsame (const char *,const char *); -extern int superlu_xerbla (const char *, int *); -extern void ifill (int *, int, int); -extern void snode_profile (int, int *); -extern void super_stats (int, int *); -extern void PrintSumm (char *, int, int, int); -extern void StatInit(SuperLUStat_t *); -extern void StatPrint (SuperLUStat_t *); -extern void StatFree(SuperLUStat_t *); -extern void print_panel_seg(int, int, int, int, int *, int *); -extern void check_repfnz(int, int, int, int *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_UTIL */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_zdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_zdefs.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_zdefs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_zdefs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,237 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#ifndef __SUPERLU_zSP_DEFS /* allow multiple inclusions */ -#define __SUPERLU_zSP_DEFS - -/* - * File name: zsp_defs.h - * Purpose: Sparse matrix types and function prototypes - * History: - */ - -#ifdef _CRAY -#include -#include -#endif - -/* Define my integer type int_t */ -typedef int int_t; /* default */ - -#include "slu_Cnames.h" -#include "supermatrix.h" -#include "slu_util.h" -#include "slu_dcomplex.h" - - -/* - * Global data structures used in LU factorization - - * - * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper]. - * (xsup,supno): supno[i] is the supernode no to which i belongs; - * xsup(s) points to the beginning of the s-th supernode. - * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12) - * xsup 0 1 2 4 7 12 - * Note: dfs will be performed on supernode rep. relative to the new - * row pivoting ordering - * - * (xlsub,lsub): lsub[*] contains the compressed subscript of - * rectangular supernodes; xlsub[j] points to the starting - * location of the j-th column in lsub[*]. Note that xlsub - * is indexed by column. - * Storage: original row subscripts - * - * During the course of sparse LU factorization, we also use - * (xlsub,lsub) for the purpose of symmetric pruning. For each - * supernode {s,s+1,...,t=s+r} with first column s and last - * column t, the subscript set - * lsub[j], j=xlsub[s], .., xlsub[s+1]-1 - * is the structure of column s (i.e. structure of this supernode). - * It is used for the storage of numerical values. - * Furthermore, - * lsub[j], j=xlsub[t], .., xlsub[t+1]-1 - * is the structure of the last column t of this supernode. - * It is for the purpose of symmetric pruning. Therefore, the - * structural subscripts can be rearranged without making physical - * interchanges among the numerical values. - * - * However, if the supernode has only one column, then we - * only keep one set of subscripts. For any subscript interchange - * performed, similar interchange must be done on the numerical - * values. - * - * The last column structures (for pruning) will be removed - * after the numercial LU factorization phase. - * - * (xlusup,lusup): lusup[*] contains the numerical values of the - * rectangular supernodes; xlusup[j] points to the starting - * location of the j-th column in storage vector lusup[*] - * Note: xlusup is indexed by column. - * Each rectangular supernode is stored by column-major - * scheme, consistent with Fortran 2-dim array storage. - * - * (xusub,ucol,usub): ucol[*] stores the numerical values of - * U-columns outside the rectangular supernodes. The row - * subscript of nonzero ucol[k] is stored in usub[k]. - * xusub[i] points to the starting location of column i in ucol. - * Storage: new row subscripts; that is subscripts of PA. - */ -typedef struct { - int *xsup; /* supernode and column mapping */ - int *supno; - int *lsub; /* compressed L subscripts */ - int *xlsub; - doublecomplex *lusup; /* L supernodes */ - int *xlusup; - doublecomplex *ucol; /* U columns */ - int *usub; - int *xusub; - int nzlmax; /* current max size of lsub */ - int nzumax; /* " " " ucol */ - int nzlumax; /* " " " lusup */ - int n; /* number of columns in the matrix */ - LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ -} GlobalLU_t; - -typedef struct { - float for_lu; - float total_needed; - int expansions; -} mem_usage_t; - -#ifdef __cplusplus -extern "C" { -#endif - -/* Driver routines */ -extern void -zgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); -extern void -zgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, - char *, double *, double *, SuperMatrix *, SuperMatrix *, - void *, int, SuperMatrix *, SuperMatrix *, - double *, double *, double *, double *, - mem_usage_t *, SuperLUStat_t *, int *); - -/* Supernodal LU factor related */ -extern void -zCreate_CompCol_Matrix(SuperMatrix *, int, int, int, doublecomplex *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -zCreate_CompRow_Matrix(SuperMatrix *, int, int, int, doublecomplex *, - int *, int *, Stype_t, Dtype_t, Mtype_t); -extern void -zCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); -extern void -zCreate_Dense_Matrix(SuperMatrix *, int, int, doublecomplex *, int, - Stype_t, Dtype_t, Mtype_t); -extern void -zCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, doublecomplex *, - int *, int *, int *, int *, int *, - Stype_t, Dtype_t, Mtype_t); -extern void -zCopy_Dense_Matrix(int, int, doublecomplex *, int, doublecomplex *, int); - -extern void countnz (const int, int *, int *, int *, GlobalLU_t *); -extern void fixupL (const int, const int *, GlobalLU_t *); - -extern void zallocateA (int, int, doublecomplex **, int **, int **); -extern void zgstrf (superlu_options_t*, SuperMatrix*, double, - int, int, int*, void *, int, int *, int *, - SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); -extern int zsnode_dfs (const int, const int, const int *, const int *, - const int *, int *, int *, GlobalLU_t *); -extern int zsnode_bmod (const int, const int, const int, doublecomplex *, - doublecomplex *, GlobalLU_t *, SuperLUStat_t*); -extern void zpanel_dfs (const int, const int, const int, SuperMatrix *, - int *, int *, doublecomplex *, int *, int *, int *, - int *, int *, int *, int *, GlobalLU_t *); -extern void zpanel_bmod (const int, const int, const int, const int, - doublecomplex *, doublecomplex *, int *, int *, - GlobalLU_t *, SuperLUStat_t*); -extern int zcolumn_dfs (const int, const int, int *, int *, int *, int *, - int *, int *, int *, int *, int *, GlobalLU_t *); -extern int zcolumn_bmod (const int, const int, doublecomplex *, - doublecomplex *, int *, int *, int, - GlobalLU_t *, SuperLUStat_t*); -extern int zcopy_to_ucol (int, int, int *, int *, int *, - doublecomplex *, GlobalLU_t *); -extern int zpivotL (const int, const double, int *, int *, - int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); -extern void zpruneL (const int, const int *, const int, const int, - const int *, const int *, int *, GlobalLU_t *); -extern void zreadmt (int *, int *, int *, doublecomplex **, int **, int **); -extern void zGenXtrue (int, int, doublecomplex *, int); -extern void zFillRHS (trans_t, int, doublecomplex *, int, SuperMatrix *, - SuperMatrix *); -extern void zgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, - SuperMatrix *, SuperLUStat_t*, int *); - - -/* Driver related */ - -extern void zgsequ (SuperMatrix *, double *, double *, double *, - double *, double *, int *); -extern void zlaqgs (SuperMatrix *, double *, double *, double, - double, double, char *); -extern void zgscon (char *, SuperMatrix *, SuperMatrix *, - double, double *, SuperLUStat_t*, int *); -extern double zPivotGrowth(int, SuperMatrix *, int *, - SuperMatrix *, SuperMatrix *); -extern void zgsrfs (trans_t, SuperMatrix *, SuperMatrix *, - SuperMatrix *, int *, int *, char *, double *, - double *, SuperMatrix *, SuperMatrix *, - double *, double *, SuperLUStat_t*, int *); - -extern int sp_ztrsv (char *, char *, char *, SuperMatrix *, - SuperMatrix *, doublecomplex *, SuperLUStat_t*, int *); -extern int sp_zgemv (char *, doublecomplex, SuperMatrix *, doublecomplex *, - int, doublecomplex, doublecomplex *, int); - -extern int sp_zgemm (char *, char *, int, int, int, doublecomplex, - SuperMatrix *, doublecomplex *, int, doublecomplex, - doublecomplex *, int); - -/* Memory-related */ -extern int zLUMemInit (fact_t, void *, int, int, int, int, int, - SuperMatrix *, SuperMatrix *, - GlobalLU_t *, int **, doublecomplex **); -extern void zSetRWork (int, int, doublecomplex *, doublecomplex **, doublecomplex **); -extern void zLUWorkFree (int *, doublecomplex *, GlobalLU_t *); -extern int zLUMemXpand (int, int, MemType, int *, GlobalLU_t *); - -extern doublecomplex *doublecomplexMalloc(int); -extern doublecomplex *doublecomplexCalloc(int); -extern double *doubleMalloc(int); -extern double *doubleCalloc(int); -extern int zmemory_usage(const int, const int, const int, const int); -extern int zQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); - -/* Auxiliary routines */ -extern void zreadhb(int *, int *, int *, doublecomplex **, int **, int **); -extern void zCompRow_to_CompCol(int, int, int, doublecomplex*, int*, int*, - doublecomplex **, int **, int **); -extern void zfill (doublecomplex *, int, doublecomplex); -extern void zinf_norm_error (int, SuperMatrix *, doublecomplex *); -extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, - doublecomplex, doublecomplex, doublecomplex *, doublecomplex *, char *); - -/* Routines for debugging */ -extern void zPrint_CompCol_Matrix(char *, SuperMatrix *); -extern void zPrint_SuperNode_Matrix(char *, SuperMatrix *); -extern void zPrint_Dense_Matrix(char *, SuperMatrix *); -extern void print_lu_col(char *, int, int, int *, GlobalLU_t *); -extern void check_tempv(int, doublecomplex *); - -#ifdef __cplusplus - } -#endif - -#endif /* __SUPERLU_zSP_DEFS */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smemory.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,680 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_sdefs.h" - -/* Constants */ -#define NO_MEMTYPE 4 /* 0: lusup; - 1: ucol; - 2: lsub; - 3: usub */ -#define GluIntArray(n) (5 * (n) + 5) - -/* Internal prototypes */ -void *sexpand (int *, MemType,int, int, GlobalLU_t *); -int sLUWorkInit (int, int, int, int **, float **, LU_space_t); -void copy_mem_float (int, void *, void *); -void sStackCompress (GlobalLU_t *); -void sSetupSpace (void *, int, LU_space_t *); -void *suser_malloc (int, int); -void suser_free (int, int); - -/* External prototypes (in memory.c - prec-indep) */ -extern void copy_mem_int (int, void *, void *); -extern void user_bcopy (char *, char *, int); - -/* Headers for 4 types of dynamatically managed memory */ -typedef struct e_node { - int size; /* length of the memory that has been used */ - void *mem; /* pointer to the new malloc'd store */ -} ExpHeader; - -typedef struct { - int size; - int used; - int top1; /* grow upward, relative to &array[0] */ - int top2; /* grow downward */ - void *array; -} LU_stack_t; - -/* Variables local to this file */ -static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */ -static LU_stack_t stack; -static int no_expand; - -/* Macros to manipulate stack */ -#define StackFull(x) ( x + stack.used >= stack.size ) -#define NotDoubleAlign(addr) ( (long int)addr & 7 ) -#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) -#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ - (w + 1) * m * sizeof(float) ) -#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ - - - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void sSetupSpace(void *work, int lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -void *suser_malloc(int bytes, int which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void suser_free(int bytes, int which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int sQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) -{ - SCformat *Lstore; - NCformat *Ustore; - register int n, iword, dword, panel_size = sp_ienv(1); - - Lstore = L->Store; - Ustore = U->Store; - n = L->ncol; - iword = sizeof(int); - dword = sizeof(float); - - /* For LU factors */ - mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] * - dword + Lstore->rowind_colptr[n] * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Ustore->colptr[n] * (dword + iword) ); - - /* Working storage to support factorization */ - mem_usage->total_needed = mem_usage->for_lu + - (float)( (2 * panel_size + 4 + NO_MARKER) * n * iword + - (panel_size + 1) * n * dword ); - - mem_usage->expansions = --no_expand; - - return 0; -} /* sQuerySpace */ - -/* - * Allocate storage for the data structures common to all factor routines. - * For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -int -sLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, - int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, - int **iwork, float **dwork) -{ - int info, iword, dword; - SCformat *Lstore; - NCformat *Ustore; - int *xsup, *supno; - int *lsub, *xlsub; - float *lusup; - int *xlusup; - float *ucol; - int *usub, *xusub; - int nzlmax, nzumax, nzlumax; - int FILL = sp_ienv(6); - - Glu->n = n; - no_expand = 0; - iword = sizeof(int); - dword = sizeof(float); - - if ( !expanders ) - expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader)); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact != SamePattern_SameRowPerm ) { - /* Guess for L\U factors */ - nzumax = nzlumax = FILL * annz; - nzlmax = SUPERLU_MAX(1, FILL/4.) * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else { - sSetupSpace(work, lwork, &Glu->MemModel); - } - -#if ( PRNTlevel >= 1 ) - printf("sLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", - FILL, nzlmax, nzumax); - fflush(stdout); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu->MemModel == SYSTEM ) { - xsup = intMalloc(n+1); - supno = intMalloc(n+1); - xlsub = intMalloc(n+1); - xlusup = intMalloc(n+1); - xusub = intMalloc(n+1); - } else { - xsup = (int *)suser_malloc((n+1) * iword, HEAD); - supno = (int *)suser_malloc((n+1) * iword, HEAD); - xlsub = (int *)suser_malloc((n+1) * iword, HEAD); - xlusup = (int *)suser_malloc((n+1) * iword, HEAD); - xusub = (int *)suser_malloc((n+1) * iword, HEAD); - } - - lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) sexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) sexpand( &nzumax, USUB, 0, 1, Glu ); - - while ( !lusup || !ucol || !lsub || !usub ) { - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE(lusup); - SUPERLU_FREE(ucol); - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - suser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD); - } - nzlumax /= 2; - nzumax /= 2; - nzlmax /= 2; - if ( nzlumax < annz ) { - printf("Not enough memory to perform factorization.\n"); - return (smemory_usage(nzlmax, nzumax, nzlumax, n) + n); - } -#if ( PRNTlevel >= 1) - printf("sLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", - nzlmax, nzumax); - fflush(stdout); -#endif - lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) sexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) sexpand( &nzumax, USUB, 0, 1, Glu ); - } - - } else { - /* fact == SamePattern_SameRowPerm */ - Lstore = L->Store; - Ustore = U->Store; - xsup = Lstore->sup_to_col; - supno = Lstore->col_to_sup; - xlsub = Lstore->rowind_colptr; - xlusup = Lstore->nzval_colptr; - xusub = Ustore->colptr; - nzlmax = Glu->nzlmax; /* max from previous factorization */ - nzumax = Glu->nzumax; - nzlumax = Glu->nzlumax; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else if ( lwork == 0 ) { - Glu->MemModel = SYSTEM; - } else { - Glu->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - lsub = expanders[LSUB].mem = Lstore->rowind; - lusup = expanders[LUSUP].mem = Lstore->nzval; - usub = expanders[USUB].mem = Ustore->rowind; - ucol = expanders[UCOL].mem = Ustore->nzval;; - expanders[LSUB].size = nzlmax; - expanders[LUSUP].size = nzlumax; - expanders[USUB].size = nzumax; - expanders[UCOL].size = nzumax; - } - - Glu->xsup = xsup; - Glu->supno = supno; - Glu->lsub = lsub; - Glu->xlsub = xlsub; - Glu->lusup = lusup; - Glu->xlusup = xlusup; - Glu->ucol = ucol; - Glu->usub = usub; - Glu->xusub = xusub; - Glu->nzlmax = nzlmax; - Glu->nzumax = nzumax; - Glu->nzlumax = nzlumax; - - info = sLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel); - if ( info ) - return ( info + smemory_usage(nzlmax, nzumax, nzlumax, n) + n); - - ++no_expand; - return 0; - -} /* sLUMemInit */ - -/* Allocate known working storage. Returns 0 if success, otherwise - returns the number of bytes allocated so far when failure occurred. */ -int -sLUWorkInit(int m, int n, int panel_size, int **iworkptr, - float **dworkptr, LU_space_t MemModel) -{ - int isize, dsize, extra; - float *old_ptr; - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - - isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); - dsize = (m * panel_size + - NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(float); - - if ( MemModel == SYSTEM ) - *iworkptr = (int *) intCalloc(isize/sizeof(int)); - else - *iworkptr = (int *) suser_malloc(isize, TAIL); - if ( ! *iworkptr ) { - fprintf(stderr, "sLUWorkInit: malloc fails for local iworkptr[]\n"); - return (isize + n); - } - - if ( MemModel == SYSTEM ) - *dworkptr = (float *) SUPERLU_MALLOC(dsize); - else { - *dworkptr = (float *) suser_malloc(dsize, TAIL); - if ( NotDoubleAlign(*dworkptr) ) { - old_ptr = *dworkptr; - *dworkptr = (float*) DoubleAlign(*dworkptr); - *dworkptr = (float*) ((double*)*dworkptr - 1); - extra = (char*)old_ptr - (char*)*dworkptr; -#ifdef DEBUG - printf("sLUWorkInit: not aligned, extra %d\n", extra); -#endif - stack.top2 -= extra; - stack.used += extra; - } - } - if ( ! *dworkptr ) { - fprintf(stderr, "malloc fails for local dworkptr[]."); - return (isize + dsize + n); - } - - return 0; -} - - -/* - * Set up pointers for real working arrays. - */ -void -sSetRWork(int m, int panel_size, float *dworkptr, - float **dense, float **tempv) -{ - float zero = 0.0; - - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - *dense = dworkptr; - *tempv = *dense + panel_size*m; - sfill (*dense, m * panel_size, zero); - sfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); -} - -/* - * Free the working storage used by factor routines. - */ -void sLUWorkFree(int *iwork, float *dwork, GlobalLU_t *Glu) -{ - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE (iwork); - SUPERLU_FREE (dwork); - } else { - stack.used -= (stack.size - stack.top2); - stack.top2 = stack.size; -/* sStackCompress(Glu); */ - } - - SUPERLU_FREE (expanders); - expanders = 0; -} - -/* Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -sLUMemXpand(int jcol, - int next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int *maxlen, /* modified - maximum length of a data structure */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#ifdef DEBUG - printf("sLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - if (mem_type == USUB) - new_mem = sexpand(maxlen, mem_type, next, 1, Glu); - else - new_mem = sexpand(maxlen, mem_type, next, 0, Glu); - - if ( !new_mem ) { - int nzlmax = Glu->nzlmax; - int nzumax = Glu->nzumax; - int nzlumax = Glu->nzlumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (smemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); - } - - switch ( mem_type ) { - case LUSUP: - Glu->lusup = (float *) new_mem; - Glu->nzlumax = *maxlen; - break; - case UCOL: - Glu->ucol = (float *) new_mem; - Glu->nzumax = *maxlen; - break; - case LSUB: - Glu->lsub = (int *) new_mem; - Glu->nzlmax = *maxlen; - break; - case USUB: - Glu->usub = (int *) new_mem; - Glu->nzumax = *maxlen; - break; - } - - return 0; - -} - - - -void -copy_mem_float(int howmany, void *old, void *new) -{ - register int i; - float *dold = old; - float *dnew = new; - for (i = 0; i < howmany; i++) dnew[i] = dold[i]; -} - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -void -*sexpand ( - int *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int len_to_copy, /* size of the memory to be copied to new store */ - int keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem, *old_mem; - int new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( type == LSUB || type == USUB ) lword = sizeof(int); - else lword = sizeof(float); - - if ( Glu->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - } - } - if ( type == LSUB || type == USUB ) { - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - } else { - copy_mem_float(len_to_copy, expanders[type].mem, new_mem); - } - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = suser_malloc(new_len * lword, HEAD); - if ( NotDoubleAlign(new_mem) && - (type == LUSUP || type == UCOL) ) { - old_mem = new_mem; - new_mem = (void *)DoubleAlign(new_mem); - extra = (char*)new_mem - (char*)old_mem; -#ifdef DEBUG - printf("expand(): not aligned, extra %d\n", extra); -#endif - stack.top1 += extra; - stack.used += extra; - } - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu->usub = expanders[USUB].mem = - (void*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu->lsub = expanders[LSUB].mem = - (void*)((char*)expanders[LSUB].mem + extra); - } - if ( type < UCOL ) { - Glu->ucol = expanders[UCOL].mem = - (void*)((char*)expanders[UCOL].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - if ( type == UCOL ) { - stack.top1 += extra; /* Add same amount for USUB */ - stack.used += extra; - } - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* sexpand */ - - -/* - * Compress the work[] array to remove fragmentation. - */ -void -sStackCompress(GlobalLU_t *Glu) -{ - register int iword, dword, ndim; - char *last, *fragment; - int *ifrom, *ito; - float *dfrom, *dto; - int *xlsub, *lsub, *xusub, *usub, *xlusup; - float *ucol, *lusup; - - iword = sizeof(int); - dword = sizeof(float); - ndim = Glu->n; - - xlsub = Glu->xlsub; - lsub = Glu->lsub; - xusub = Glu->xusub; - usub = Glu->usub; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - lusup = Glu->lusup; - - dfrom = ucol; - dto = (float *)((char*)lusup + xlusup[ndim] * dword); - copy_mem_float(xusub[ndim], dfrom, dto); - ucol = dto; - - ifrom = lsub; - ito = (int *) ((char*)ucol + xusub[ndim] * iword); - copy_mem_int(xlsub[ndim], ifrom, ito); - lsub = ito; - - ifrom = usub; - ito = (int *) ((char*)lsub + xlsub[ndim] * iword); - copy_mem_int(xusub[ndim], ifrom, ito); - usub = ito; - - last = (char*)usub + xusub[ndim] * iword; - fragment = (char*) (((char*)stack.array + stack.top1) - last); - stack.used -= (long int) fragment; - stack.top1 -= (long int) fragment; - - Glu->ucol = ucol; - Glu->lsub = lsub; - Glu->usub = usub; - -#ifdef DEBUG - printf("sStackCompress: fragment %d\n", fragment); - /* for (last = 0; last < ndim; ++last) - print_lu_col("After compress:", last, 0);*/ -#endif - -} - -/* - * Allocate storage for original matrix A - */ -void -sallocateA(int n, int nnz, float **a, int **asub, int **xa) -{ - *a = (float *) floatMalloc(nnz); - *asub = (int *) intMalloc(nnz); - *xa = (int *) intMalloc(n+1); -} - - -float *floatMalloc(int n) -{ - float *buf; - buf = (float *) SUPERLU_MALLOC((size_t)n * sizeof(float)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in floatMalloc()\n"); - } - return (buf); -} - -float *floatCalloc(int n) -{ - float *buf; - register int i; - float zero = 0.0; - buf = (float *) SUPERLU_MALLOC((size_t)n * sizeof(float)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in floatCalloc()\n"); - } - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - - -int smemory_usage(const int nzlmax, const int nzumax, - const int nzlumax, const int n) -{ - register int iword, dword; - - iword = sizeof(int); - dword = sizeof(float); - - return (10 * n * iword + - nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smyblas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,224 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: smyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void slsolve ( int ldm, int ncol, float *M, float *rhs ) -{ - int k; - float x0, x1, x2, x3, x4, x5, x6, x7; - float *M0; - register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - - M0 = &M[0]; - - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - Mki4 = Mki3 + ldm + 1; - Mki5 = Mki4 + ldm + 1; - Mki6 = Mki5 + ldm + 1; - Mki7 = Mki6 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++; - x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++; - x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; - x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - rhs[++firstcol] = x4; - rhs[++firstcol] = x5; - rhs[++firstcol] = x6; - rhs[++firstcol] = x7; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++ - - x4 * *Mki4++ - x5 * *Mki5++ - - x6 * *Mki6++ - x7 * *Mki7++; - - M0 += 8 * ldm + 8; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; - x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - - x2 * *Mki2++ - x3 * *Mki3++; - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - x1 = rhs[firstcol+1] - x0 * *Mki0++; - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) - rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; - - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -susolve ( ldm, ncol, M, rhs ) -int ldm; /* in */ -int ncol; /* in */ -float *M; /* in */ -float *rhs; /* modified */ -{ - float xj; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) - rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void smatvec ( ldm, nrow, ncol, M, vec, Mxvec ) - -int ldm; /* in -- leading dimension of M */ -int nrow; /* in */ -int ncol; /* in */ -float *M; /* in */ -float *vec; /* in */ -float *Mxvec; /* in/out */ - -{ - float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; - float *M0; - register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; - register int firstcol = 0; - int k; - - M0 = &M[0]; - while ( firstcol < ncol - 7 ) { /* Do 8 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - Mki4 = Mki3 + ldm; - Mki5 = Mki4 + ldm; - Mki6 = Mki5 + ldm; - Mki7 = Mki6 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - vi4 = vec[firstcol++]; - vi5 = vec[firstcol++]; - vi6 = vec[firstcol++]; - vi7 = vec[firstcol++]; - - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ - + vi4 * *Mki4++ + vi5 * *Mki5++ - + vi6 * *Mki6++ + vi7 * *Mki7++; - - M0 += 8 * ldm; - } - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ - + vi2 * *Mki2++ + vi3 * *Mki3++ ; - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) - Mxvec[k] += vi0 * *Mki0++; - - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,450 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_sdefs.h" - -/* - * Function prototypes - */ -void slsolve(int, int, float *, float *); -void smatvec(int, int, int, float *, float *, float *); -extern void scheck_tempv(); - -void -spanel_bmod ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - const int nseg, /* in */ - float *dense, /* out, of size n by w */ - float *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in, of size n by w */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * - * Performs numeric block updates (sup-panel) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - * Before entering this routine, the original nonzeros in the panel - * were already copied into the spa[m,w]. - * - * Updated/Output parameters- - * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned - * collectively in the m-by-w vector dense[*]. - * - */ - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - float alpha, beta; -#endif - - register int k, ksub; - int fsupc, nsupc, nsupr, nrow; - int krep, krep_ind; - float ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int segsze; - int block_nrow; /* no of rows in a block row */ - register int lptr; /* Points to the row subscripts of a supernode */ - int kfnz, irow, no_zeros; - register int isub, isub1, i; - register int jj; /* Index through each column in the panel */ - int *xsup, *supno; - int *lsub, *xlsub; - float *lusup; - int *xlusup; - int *repfnz_col; /* repfnz[] for a column in the panel */ - float *dense_col; /* dense[] for a column in the panel */ - float *tempv1; /* Used in 1-D update */ - float *TriTmp, *MatvecTmp; /* used in 2-D update */ - float zero = 0.0; - float one = 1.0; - register int ldaTmp; - register int r_ind, r_hi; - static int first = 1, maxsuper, rowblk, colblk; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - if ( first ) { - maxsuper = sp_ienv(3); - rowblk = sp_ienv(4); - colblk = sp_ienv(5); - first = 0; - } - ldaTmp = maxsuper + rowblk; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in a supernode - * nsupr = no of rows in a supernode - */ - krep = segrep[k--]; - fsupc = xsup[supno[krep]]; - nsupc = krep - fsupc + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nrow = nsupr - nsupc; - lptr = xlsub[fsupc]; - krep_ind = lptr + nsupc - 1; - - repfnz_col = repfnz; - dense_col = dense; - - if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ - - TriTmp = tempv; - - /* Sequence through each column in panel -- triangular solves */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - dense_col[irow] -= ukj * lusup[luptr]; - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - ukj -= ukj1 * lusup[luptr1]; - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; - dense_col[irow] -= (ukj*lusup[luptr] - + ukj1*lusup[luptr1]); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; luptr2++; - dense_col[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - } else { /* segsze >= 4 */ - - /* Copy U[*,j] segment from dense[*] to TriTmp[*], which - holds the result of triangular solves. */ - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - TriTmp[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#else - strsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#endif -#else - slsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); -#endif - - - } /* else ... */ - - } /* for jj ... end tri-solves */ - - /* Block row updates; push all the way into dense[*] block */ - for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { - - r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); - block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); - luptr = xlusup[fsupc] + nsupc + r_ind; - isub1 = lptr + nsupc + r_ind; - - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - /* Sequence through each column in panel -- matrix-vector */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - /* Perform a block update, and scatter the result of - matrix-vector to dense[]. */ - no_zeros = kfnz - fsupc; - luptr1 = luptr + nsupr * no_zeros; - MatvecTmp = &TriTmp[maxsuper]; - -#ifdef USE_VENDOR_BLAS - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#else - sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#endif -#else - smatvec(nsupr, block_nrow, segsze, &lusup[luptr1], - TriTmp, MatvecTmp); -#endif - - /* Scatter MatvecTmp[*] into SPA dense[*] temporarily - * such that MatvecTmp[*] can be re-used for the - * the next blok row update. dense[] will be copied into - * global store after the whole panel has been finished. - */ - isub = isub1; - for (i = 0; i < block_nrow; i++) { - irow = lsub[isub]; - dense_col[irow] -= MatvecTmp[i]; - MatvecTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } /* for each block row ... */ - - /* Scatter the triangular solves into SPA dense[*] */ - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = TriTmp[i]; - TriTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } else { /* 1-D block modification */ - - - /* Sequence through each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += segsze * (segsze - 1); - ops[GEMV] += 2 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - dense_col[irow] -= ukj * lusup[luptr]; - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - ukj -= ukj1 * lusup[luptr1]; - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; - dense_col[irow] -= (ukj*lusup[luptr] - + ukj1*lusup[luptr1]); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - ukj1 -= ukj2 * lusup[luptr2-1]; - ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; ++luptr2; - dense_col[irow] -= ( ukj*lusup[luptr] - + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); - } - } - - } else { /* segsze >= 4 */ - /* - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense[]. - */ - no_zeros = kfnz - fsupc; - - /* Copy U[*,j] segment from dense[*] to tempv[*]: - * The result of triangular solve is in tempv[*]; - * The result of matrix vector update is in dense_col[*] - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - tempv[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - strsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - slsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); -#endif - - /* Scatter tempv[*] into SPA dense[*] temporarily, such - * that tempv[*] can be used for the triangular solve of - * the next column of the panel. They will be copied into - * ucol[*] after the whole panel has been finished. - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = tempv[i]; - tempv[i] = zero; - isub++; - } - - /* Scatter the update from tempv1[*] into SPA dense[*] */ - /* Start dense rectangular L */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - dense_col[irow] -= tempv1[i]; - tempv1[i] = zero; - ++isub; - } - - } /* else segsze>=4 ... */ - - } /* for each column in the panel... */ - - } /* else 1-D update ... */ - - } /* for each updating supernode ... */ - -} - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,247 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -void -spanel_dfs ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - SuperMatrix *A, /* in - original matrix */ - int *perm_r, /* in */ - int *nseg, /* out */ - float *dense, /* out */ - int *panel_lsub, /* out */ - int *segrep, /* out */ - int *repfnz, /* out */ - int *xprune, /* out */ - int *marker, /* out */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * - * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. - * - * The routine returns one list of the supernodal representatives - * in topological order of the dfs that generates them. This list is - * a superset of the topological order of each individual column within - * the panel. - * The location of the first nonzero in each supernodal segment - * (supernodal entry location) is also returned. Each column has a - * separate list for this purpose. - * - * Two marker arrays are used for dfs: - * marker[i] == jj, if i was visited during dfs of current column jj; - * marker1[i] >= jcol, if i was visited by earlier columns in this panel; - * - * marker: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - */ - NCPformat *Astore; - float *a; - int *asub; - int *xa_begin, *xa_end; - int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; - int k, krow, kmark, kperm; - int xdfs, maxdfs, kpar; - int jj; /* index through each column in the panel */ - int *marker1; /* marker1[jj] >= jcol if vertex jj was visited - by a previous column within this panel. */ - int *repfnz_col; /* start of each column in the panel */ - float *dense_col; /* start of each column in the panel */ - int nextl_col; /* next available position in panel_lsub[*,jj] */ - int *xsup, *supno; - int *lsub, *xlsub; - - /* Initialize pointers */ - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - marker1 = marker + m; - repfnz_col = repfnz; - dense_col = dense; - *nseg = 0; - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - - /* For each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++) { - nextl_col = (jj - jcol) * m; - -#ifdef CHK_DFS - printf("\npanel col %d: ", jj); -#endif - - /* For each nonz in A[*,jj] do dfs */ - for (k = xa_begin[jj]; k < xa_end[jj]; k++) { - krow = asub[k]; - dense_col[krow] = a[k]; - kmark = marker[krow]; - if ( kmark == jj ) - continue; /* krow visited before, go to the next nonzero */ - - /* For each unmarked nbr krow of jj - * krow is in L: place it in structure of L[*,jj] - */ - marker[krow] = jj; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ - } - /* - * krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - else { - - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz_col[krep]; - -#ifdef CHK_DFS - printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); -#endif - if ( myfnz != EMPTY ) { /* Representative visited before */ - if ( myfnz > kperm ) repfnz_col[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz_col[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker[kchild]; - - if ( chmark != jj ) { /* Not reached yet */ - marker[kchild] = jj; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,j] */ - if ( chperm == EMPTY ) { - panel_lsub[nextl_col++] = kchild; - } - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - else { - - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz_col[chrep]; -#ifdef CHK_DFS - printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); -#endif - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz_col[chrep] = chperm; - } - else { - /* Cont. dfs at snode-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L) */ - parent[krep] = oldrep; - repfnz_col[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } /* else */ - - } /* else */ - - } /* if... */ - - } /* while xdfs < maxdfs */ - - /* krow has no more unexplored nbrs: - * Place snode-rep krep in postorder DFS, if this - * segment is seen for the first time. (Note that - * "repfnz[krep]" may change later.) - * Backtrack dfs to its parent. - */ - if ( marker1[krep] < jcol ) { - segrep[*nseg] = krep; - ++(*nseg); - marker1[krep] = jj; - } - - kpar = parent[krep]; /* Pop stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } while ( kpar != EMPTY ); /* do-while - until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonz in A[*,jj] */ - - repfnz_col += m; /* Move to next column */ - dense_col += m; - - } /* for jj ... */ - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_coletree.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_coletree.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_coletree.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_coletree.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,332 +0,0 @@ - -/* Elimination tree computation and layout routines */ - -#include -#include -#include "slu_ddefs.h" - -/* - * Implementation of disjoint set union routines. - * Elements are integers in 0..n-1, and the - * names of the sets themselves are of type int. - * - * Calls are: - * initialize_disjoint_sets (n) initial call. - * s = make_set (i) returns a set containing only i. - * s = splink (t, u) returns s = t union u, destroying t and u. - * s = find (i) return name of set containing i. - * finalize_disjoint_sets final call. - * - * This implementation uses path compression but not weighted union. - * See Tarjan's book for details. - * John Gilbert, CMI, 1987. - * - * Implemented path-halving by XSL 07/05/95. - */ - -static int *pp; /* parent array for sets */ - -static -int *mxCallocInt(int n) -{ - register int i; - int *buf; - - buf = (int *) SUPERLU_MALLOC( n * sizeof(int) ); - if ( !buf ) { - ABORT("SUPERLU_MALLOC fails for buf in mxCallocInt()"); - } - for (i = 0; i < n; i++) buf[i] = 0; - return (buf); -} - -static -void initialize_disjoint_sets ( - int n - ) -{ - pp = mxCallocInt(n); -} - - -static -int make_set ( - int i - ) -{ - pp[i] = i; - return i; -} - - -static -int splink ( - int s, - int t - ) -{ - pp[s] = t; - return t; -} - - -/* PATH HALVING */ -static -int find (int i) -{ - register int p, gp; - - p = pp[i]; - gp = pp[p]; - while (gp != p) { - pp[i] = gp; - i = gp; - p = pp[i]; - gp = pp[p]; - } - return (p); -} - -#if 0 -/* PATH COMPRESSION */ -static -int find ( - int i - ) -{ - if (pp[i] != i) - pp[i] = find (pp[i]); - return pp[i]; -} -#endif - -static -void finalize_disjoint_sets ( - void - ) -{ - SUPERLU_FREE(pp); -} - - -/* - * Find the elimination tree for A'*A. - * This uses something similar to Liu's algorithm. - * It runs in time O(nz(A)*log n) and does not form A'*A. - * - * Input: - * Sparse matrix A. Numeric values are ignored, so any - * explicit zeros are treated as nonzero. - * Output: - * Integer array of parents representing the elimination - * tree of the symbolic product A'*A. Each vertex is a - * column of A, and nc means a root of the elimination forest. - * - * John R. Gilbert, Xerox, 10 Dec 1990 - * Based on code by JRG dated 1987, 1988, and 1990. - */ - -/* - * Nonsymmetric elimination tree - */ -int -sp_coletree( - int *acolst, int *acolend, /* column start and end past 1 */ - int *arow, /* row indices of A */ - int nr, int nc, /* dimension of A */ - int *parent /* parent in elim tree */ - ) -{ - int *root; /* root of subtee of etree */ - int *firstcol; /* first nonzero col in each row*/ - int rset, cset; - int row, col; - int rroot; - int p; - - root = mxCallocInt (nc); - initialize_disjoint_sets (nc); - - /* Compute firstcol[row] = first nonzero column in row */ - - firstcol = mxCallocInt (nr); - for (row = 0; row < nr; firstcol[row++] = nc); - for (col = 0; col < nc; col++) - for (p = acolst[col]; p < acolend[col]; p++) { - row = arow[p]; - firstcol[row] = SUPERLU_MIN(firstcol[row], col); - } - - /* Compute etree by Liu's algorithm for symmetric matrices, - except use (firstcol[r],c) in place of an edge (r,c) of A. - Thus each row clique in A'*A is replaced by a star - centered at its first vertex, which has the same fill. */ - - for (col = 0; col < nc; col++) { - cset = make_set (col); - root[cset] = col; - parent[col] = nc; /* Matlab */ - for (p = acolst[col]; p < acolend[col]; p++) { - row = firstcol[arow[p]]; - if (row >= col) continue; - rset = find (row); - rroot = root[rset]; - if (rroot != col) { - parent[rroot] = col; - cset = splink (cset, rset); - root[cset] = col; - } - } - } - - SUPERLU_FREE (root); - SUPERLU_FREE (firstcol); - finalize_disjoint_sets (); - return 0; -} - -/* - * q = TreePostorder (n, p); - * - * Postorder a tree. - * Input: - * p is a vector of parent pointers for a forest whose - * vertices are the integers 0 to n-1; p[root]==n. - * Output: - * q is a vector indexed by 0..n-1 such that q[i] is the - * i-th vertex in a postorder numbering of the tree. - * - * ( 2/7/95 modified by X.Li: - * q is a vector indexed by 0:n-1 such that vertex i is the - * q[i]-th vertex in a postorder numbering of the tree. - * That is, this is the inverse of the previous q. ) - * - * In the child structure, lower-numbered children are represented - * first, so that a tree which is already numbered in postorder - * will not have its order changed. - * - * Written by John Gilbert, Xerox, 10 Dec 1990. - * Based on code written by John Gilbert at CMI in 1987. - */ - -static int *first_kid, *next_kid; /* Linked list of children. */ -static int *post, postnum; - -static -/* - * Depth-first search from vertex v. - */ -void etdfs ( - int v - ) -{ - int w; - - for (w = first_kid[v]; w != -1; w = next_kid[w]) { - etdfs (w); - } - /* post[postnum++] = v; in Matlab */ - post[v] = postnum++; /* Modified by X.Li on 2/14/95 */ -} - - -/* - * Post order a tree - */ -int *TreePostorder( - int n, - int *parent -) -{ - int v, dad; - - /* Allocate storage for working arrays and results */ - first_kid = mxCallocInt (n+1); - next_kid = mxCallocInt (n+1); - post = mxCallocInt (n+1); - - /* Set up structure describing children */ - for (v = 0; v <= n; first_kid[v++] = -1); - for (v = n-1; v >= 0; v--) { - dad = parent[v]; - next_kid[v] = first_kid[dad]; - first_kid[dad] = v; - } - - /* Depth-first search from dummy root vertex #n */ - postnum = 0; - etdfs (n); - - SUPERLU_FREE (first_kid); - SUPERLU_FREE (next_kid); - return post; -} - - -/* - * p = spsymetree (A); - * - * Find the elimination tree for symmetric matrix A. - * This uses Liu's algorithm, and runs in time O(nz*log n). - * - * Input: - * Square sparse matrix A. No check is made for symmetry; - * elements below and on the diagonal are ignored. - * Numeric values are ignored, so any explicit zeros are - * treated as nonzero. - * Output: - * Integer array of parents representing the etree, with n - * meaning a root of the elimination forest. - * Note: - * This routine uses only the upper triangle, while sparse - * Cholesky (as in spchol.c) uses only the lower. Matlab's - * dense Cholesky uses only the upper. This routine could - * be modified to use the lower triangle either by transposing - * the matrix or by traversing it by rows with auxiliary - * pointer and link arrays. - * - * John R. Gilbert, Xerox, 10 Dec 1990 - * Based on code by JRG dated 1987, 1988, and 1990. - * Modified by X.S. Li, November 1999. - */ - -/* - * Symmetric elimination tree - */ -int -sp_symetree( - int *acolst, int *acolend, /* column starts and ends past 1 */ - int *arow, /* row indices of A */ - int n, /* dimension of A */ - int *parent /* parent in elim tree */ - ) -{ - int *root; /* root of subtree of etree */ - int rset, cset; - int row, col; - int rroot; - int p; - - root = mxCallocInt (n); - initialize_disjoint_sets (n); - - for (col = 0; col < n; col++) { - cset = make_set (col); - root[cset] = col; - parent[col] = n; /* Matlab */ - for (p = acolst[col]; p < acolend[col]; p++) { - row = arow[p]; - if (row >= col) continue; - rset = find (row); - rroot = root[rset]; - if (rroot != col) { - parent[rroot] = col; - cset = splink (cset, rset); - root[cset] = col; - } - } - } - SUPERLU_FREE (root); - finalize_disjoint_sets (); - return 0; -} /* SP_SYMETREE */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_ienv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_ienv.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_ienv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_ienv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -/* - * File name: sp_ienv.c - * History: Modified from lapack routine ILAENV - */ -#include "slu_ddefs.h" - -int -sp_ienv(int ispec) -{ -/* - Purpose - ======= - - sp_ienv() is inquired to choose machine-dependent parameters for the - local environment. See ISPEC for a description of the parameters. - - This version provides a set of parameters which should give good, - but not optimal, performance on many of the currently available - computers. Users are encouraged to modify this subroutine to set - the tuning parameters for their particular machine using the option - and problem size information in the arguments. - - Arguments - ========= - - ISPEC (input) int - Specifies the parameter to be returned as the value of SP_IENV. - = 1: the panel size w; a panel consists of w consecutive - columns of matrix A in the process of Gaussian elimination. - The best value depends on machine's cache characters. - = 2: the relaxation parameter relax; if the number of - nodes (columns) in a subtree of the elimination tree is less - than relax, this subtree is considered as one supernode, - regardless of their row structures. - = 3: the maximum size for a supernode; - = 4: the minimum row dimension for 2-D blocking to be used; - = 5: the minimum column dimension for 2-D blocking to be used; - = 6: the estimated fills factor for L and U, compared with A; - - (SP_IENV) (output) int - >= 0: the value of the parameter specified by ISPEC - < 0: if SP_IENV = -k, the k-th argument had an illegal value. - - ===================================================================== -*/ - int i; - - switch (ispec) { - case 1: return (10); - case 2: return (5); - case 3: return (100); - case 4: return (200); - case 5: return (40); - case 6: return (20); - } - - /* Invalid value for ISPEC */ - i = 1; - superlu_xerbla("sp_ienv", &i); - return 0; - -} /* sp_ienv_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotgrowth.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotgrowth.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotgrowth.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_sdefs.h" - -float -sPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, - SuperMatrix *L, SuperMatrix *U) -{ -/* - * Purpose - * ======= - * - * Compute the reciprocal pivot growth factor of the leading ncols columns - * of the matrix, using the formula: - * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) - * - * Arguments - * ========= - * - * ncols (input) int - * The number of columns of matrices A, L and U. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = NC; Dtype = SLU_S; Mtype = GE. - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SC; Dtype = SLU_S; Mtype = TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = NC; - * Dtype = SLU_S; Mtype = TRU. - * - */ - NCformat *Astore; - SCformat *Lstore; - NCformat *Ustore; - float *Aval, *Lval, *Uval; - int fsupc, nsupr, luptr, nz_in_U; - int i, j, k, oldcol; - int *inv_perm_c; - float rpg, maxaj, maxuj; - extern double slamch_(char *); - float smlnum; - float *luval; - - /* Get machine constants. */ - smlnum = slamch_("S"); - rpg = 1. / smlnum; - - Astore = A->Store; - Lstore = L->Store; - Ustore = U->Store; - Aval = Astore->nzval; - Lval = Lstore->nzval; - Uval = Ustore->nzval; - - inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); - for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; - - for (k = 0; k <= Lstore->nsuper; ++k) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - luptr = L_NZ_START(fsupc); - luval = &Lval[luptr]; - nz_in_U = 1; - - for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { - maxaj = 0.; - oldcol = inv_perm_c[j]; - for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) - maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) ); - - maxuj = 0.; - for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) - maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) ); - - /* Supernode */ - for (i = 0; i < nz_in_U; ++i) - maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) ); - - ++nz_in_U; - luval += nsupr; - - if ( maxuj == 0. ) - rpg = SUPERLU_MIN( rpg, 1.); - else - rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); - } - - if ( j >= ncols ) break; - } - - SUPERLU_FREE(inv_perm_c); - return (rpg); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_sdefs.h" - -#undef DEBUG - -int -spivotL( - const int jcol, /* in */ - const float u, /* in - diagonal pivoting threshold */ - int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ - int *perm_r, /* may be modified */ - int *iperm_r, /* in - inverse of perm_r */ - int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ - int *pivrow, /* out */ - GlobalLU_t *Glu, /* modified - global LU data structures */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * Performs the numerical pivoting on the current column of L, - * and the CDIV operation. - * - * Pivot policy: - * (1) Compute thresh = u * max_(i>=j) abs(A_ij); - * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN - * pivot row = k; - * ELSE IF abs(A_jj) >= thresh THEN - * pivot row = j; - * ELSE - * pivot row = m; - * - * Note: If you absolutely want to use a given pivot order, then set u=0.0. - * - * Return value: 0 success; - * i > 0 U(i,i) is exactly zero. - * - */ - int fsupc; /* first column in the supernode */ - int nsupc; /* no of columns in the supernode */ - int nsupr; /* no of rows in the supernode */ - int lptr; /* points to the starting subscript of the supernode */ - int pivptr, old_pivptr, diag, diagind; - float pivmax, rtemp, thresh; - float temp; - float *lu_sup_ptr; - float *lu_col_ptr; - int *lsub_ptr; - int isub, icol, k, itemp; - int *lsub, *xlsub; - float *lusup; - int *xlusup; - flops_t *ops = stat->ops; - - /* Initialize pointers */ - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; - nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ - lptr = xlsub[fsupc]; - nsupr = xlsub[fsupc+1] - lptr; - lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ - lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ - lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ - -#ifdef DEBUG -if ( jcol == MIN_COL ) { - printf("Before cdiv: col %d\n", jcol); - for (k = nsupc; k < nsupr; k++) - printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); -} -#endif - - /* Determine the largest abs numerical value for partial pivoting; - Also search for user-specified pivot, and diagonal element. */ - if ( *usepr ) *pivrow = iperm_r[jcol]; - diagind = iperm_c[jcol]; - pivmax = 0.0; - pivptr = nsupc; - diag = EMPTY; - old_pivptr = nsupc; - for (isub = nsupc; isub < nsupr; ++isub) { - rtemp = fabs (lu_col_ptr[isub]); - if ( rtemp > pivmax ) { - pivmax = rtemp; - pivptr = isub; - } - if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; - if ( lsub_ptr[isub] == diagind ) diag = isub; - } - - /* Test for singularity */ - if ( pivmax == 0.0 ) { - *pivrow = lsub_ptr[pivptr]; - perm_r[*pivrow] = jcol; - *usepr = 0; - return (jcol+1); - } - - thresh = u * pivmax; - - /* Choose appropriate pivotal element by our policy. */ - if ( *usepr ) { - rtemp = fabs (lu_col_ptr[old_pivptr]); - if ( rtemp != 0.0 && rtemp >= thresh ) - pivptr = old_pivptr; - else - *usepr = 0; - } - if ( *usepr == 0 ) { - /* Use diagonal pivot? */ - if ( diag >= 0 ) { /* diagonal exists */ - rtemp = fabs (lu_col_ptr[diag]); - if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; - } - *pivrow = lsub_ptr[pivptr]; - } - - /* Record pivot row */ - perm_r[*pivrow] = jcol; - - /* Interchange row subscripts */ - if ( pivptr != nsupc ) { - itemp = lsub_ptr[pivptr]; - lsub_ptr[pivptr] = lsub_ptr[nsupc]; - lsub_ptr[nsupc] = itemp; - - /* Interchange numerical values as well, for the whole snode, such - * that L is indexed the same way as A. - */ - for (icol = 0; icol <= nsupc; icol++) { - itemp = pivptr + icol * nsupr; - temp = lu_sup_ptr[itemp]; - lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; - lu_sup_ptr[nsupc + icol*nsupr] = temp; - } - } /* if */ - - /* cdiv operation */ - ops[FACT] += nsupr - nsupc; - - temp = 1.0 / lu_col_ptr[nsupc]; - for (k = nsupc+1; k < nsupr; k++) - lu_col_ptr[k] *= temp; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_preorder.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_preorder.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_preorder.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_preorder.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,204 +0,0 @@ -#include "slu_ddefs.h" - -void -sp_preorder(superlu_options_t *options, SuperMatrix *A, int *perm_c, - int *etree, SuperMatrix *AC) -{ -/* - * Purpose - * ======= - * - * sp_preorder() permutes the columns of the original matrix. It performs - * the following steps: - * - * 1. Apply column permutation perm_c[] to A's column pointers to form AC; - * - * 2. If options->Fact = DOFACT, then - * (1) Compute column elimination tree etree[] of AC'AC; - * (2) Post order etree[] to get a postordered elimination tree etree[], - * and a postorder permutation post[]; - * (3) Apply post[] permutation to columns of AC; - * (4) Overwrite perm_c[] with the product perm_c * post. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * Specifies whether or not the elimination tree will be re-used. - * If options->Fact == DOFACT, this means first time factor A, - * etree is computed, postered, and output. - * Otherwise, re-factor A, etree is input, unchanged on exit. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = NC or SLU_NCP; Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * perm_c (input/output) int* - * Column permutation vector of size A->ncol, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * If options->Fact == DOFACT, perm_c is both input and output. - * On output, it is changed according to a postorder of etree. - * Otherwise, perm_c is input. - * - * etree (input/output) int* - * Elimination tree of Pc'*A'*A*Pc, dimension A->ncol. - * If options->Fact == DOFACT, etree is an output argument, - * otherwise it is an input argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * AC (output) SuperMatrix* - * The resulting matrix after applied the column permutation - * perm_c[] to matrix A. The type of AC can be: - * Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE. - * - */ - - NCformat *Astore; - NCPformat *ACstore; - int *iwork, *post; - register int n, i; - - n = A->ncol; - - /* Apply column permutation perm_c to A's column pointers so to - obtain NCP format in AC = A*Pc. */ - AC->Stype = SLU_NCP; - AC->Dtype = A->Dtype; - AC->Mtype = A->Mtype; - AC->nrow = A->nrow; - AC->ncol = A->ncol; - Astore = (NCformat*) A->Store; - AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) ); - ACstore = (NCPformat*) AC->Store; - if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore"); - ACstore->nnz = Astore->nnz; - ACstore->nzval = Astore->nzval; - ACstore->rowind = Astore->rowind; - ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int)); - if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg"); - ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int)); - if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend"); - -#ifdef DEBUG - print_int_vec("pre_order:", n, perm_c); - check_perm("Initial perm_c", n, perm_c); -#endif - - for (i = 0; i < n; i++) { - ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; - ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; - } - - if ( options->Fact == DOFACT ) { -#undef ETREE_ATplusA -#ifdef ETREE_ATplusA - /*-------------------------------------------- - COMPUTE THE ETREE OF Pc*(A'+A)*Pc'. - --------------------------------------------*/ - int *b_colptr, *b_rowind, bnz, j; - int *c_colbeg, *c_colend; - - /*printf("Use etree(A'+A)\n");*/ - - /* Form B = A + A'. */ - at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, - &bnz, &b_colptr, &b_rowind); - - /* Form C = Pc*B*Pc'. */ - c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int)); - c_colend = c_colbeg + n; - if (!c_colbeg ) ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend"); - for (i = 0; i < n; i++) { - c_colbeg[perm_c[i]] = b_colptr[i]; - c_colend[perm_c[i]] = b_colptr[i+1]; - } - for (j = 0; j < n; ++j) { - for (i = c_colbeg[j]; i < c_colend[j]; ++i) { - b_rowind[i] = perm_c[b_rowind[i]]; - } - } - - /* Compute etree of C. */ - sp_symetree(c_colbeg, c_colend, b_rowind, n, etree); - - SUPERLU_FREE(b_colptr); - if ( bnz ) SUPERLU_FREE(b_rowind); - SUPERLU_FREE(c_colbeg); - -#else - /*-------------------------------------------- - COMPUTE THE COLUMN ELIMINATION TREE. - --------------------------------------------*/ - sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind, - A->nrow, A->ncol, etree); -#endif -#ifdef DEBUG - print_int_vec("etree:", n, etree); -#endif - - /* In symmetric mode, do not do postorder here. */ - if ( options->SymmetricMode == NO ) { - /* Post order etree */ - post = (int *) TreePostorder(n, etree); - /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; - iwork = post; */ - -#ifdef DEBUG - print_int_vec("post:", n+1, post); - check_perm("post", n, post); -#endif - iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); - if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); - - /* Renumber etree in postorder */ - for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; - for (i = 0; i < n; ++i) etree[i] = iwork[i]; - -#ifdef DEBUG - print_int_vec("postorder etree:", n, etree); -#endif - - /* Postmultiply A*Pc by post[] */ - for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; - for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; - for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; - for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; - - for (i = 0; i < n; ++i) - iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ - for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; - -#ifdef DEBUG - print_int_vec("Pc*post:", n, perm_c); - check_perm("final perm_c", n, perm_c); -#endif - SUPERLU_FREE (post); - SUPERLU_FREE (iwork); - } /* end postordering */ - - } /* if options->Fact == DOFACT ... */ - -} - -int check_perm(char *what, int n, int *perm) -{ - register int i; - int *marker; - marker = (int *) calloc(n, sizeof(int)); - - for (i = 0; i < n; ++i) { - if ( marker[perm[i]] == 1 || perm[i] >= n ) { - printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); - ABORT("check_perm"); - } else { - marker[perm[i]] = 1; - } - } - - SUPERLU_FREE(marker); - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spruneL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spruneL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spruneL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,147 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -void -spruneL( - const int jcol, /* in */ - const int *perm_r, /* in */ - const int pivrow, /* in */ - const int nseg, /* in */ - const int *segrep, /* in */ - const int *repfnz, /* in */ - int *xprune, /* out */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ -/* - * Purpose - * ======= - * Prunes the L-structure of supernodes whose L-structure - * contains the current pivot row "pivrow" - * - */ - float utemp; - int jsupno, irep, irep1, kmin, kmax, krow, movnum; - int i, ktemp, minloc, maxloc; - int do_prune; /* logical variable */ - int *xsup, *supno; - int *lsub, *xlsub; - float *lusup; - int *xlusup; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - /* - * For each supernode-rep irep in U[*,j] - */ - jsupno = supno[jcol]; - for (i = 0; i < nseg; i++) { - - irep = segrep[i]; - irep1 = irep + 1; - do_prune = FALSE; - - /* Don't prune with a zero U-segment */ - if ( repfnz[irep] == EMPTY ) - continue; - - /* If a snode overlaps with the next panel, then the U-segment - * is fragmented into two parts -- irep and irep1. We should let - * pruning occur at the rep-column in irep1's snode. - */ - if ( supno[irep] == supno[irep1] ) /* Don't prune */ - continue; - - /* - * If it has not been pruned & it has a nonz in row L[pivrow,i] - */ - if ( supno[irep] != jsupno ) { - if ( xprune[irep] >= xlsub[irep1] ) { - kmin = xlsub[irep]; - kmax = xlsub[irep1] - 1; - for (krow = kmin; krow <= kmax; krow++) - if ( lsub[krow] == pivrow ) { - do_prune = TRUE; - break; - } - } - - if ( do_prune ) { - - /* Do a quicksort-type partition - * movnum=TRUE means that the num values have to be exchanged. - */ - movnum = FALSE; - if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ - movnum = TRUE; - - while ( kmin <= kmax ) { - - if ( perm_r[lsub[kmax]] == EMPTY ) - kmax--; - else if ( perm_r[lsub[kmin]] != EMPTY ) - kmin++; - else { /* kmin below pivrow, and kmax above pivrow: - * interchange the two subscripts - */ - ktemp = lsub[kmin]; - lsub[kmin] = lsub[kmax]; - lsub[kmax] = ktemp; - - /* If the supernode has only one column, then we - * only keep one set of subscripts. For any subscript - * interchange performed, similar interchange must be - * done on the numerical values. - */ - if ( movnum ) { - minloc = xlusup[irep] + (kmin - xlsub[irep]); - maxloc = xlusup[irep] + (kmax - xlsub[irep]); - utemp = lusup[minloc]; - lusup[minloc] = lusup[maxloc]; - lusup[maxloc] = utemp; - } - - kmin++; - kmax--; - - } - - } /* while */ - - xprune[irep] = kmin; /* Pruning */ - -#ifdef CHK_PRUNE - printf(" After spruneL(),using col %d: xprune[%d] = %d\n", - jcol, irep, kmin); -#endif - } /* if do_prune */ - - } /* if */ - - } /* for each U-segment... */ -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sreadhb.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sreadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sreadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,255 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include -#include "slu_sdefs.h" - - -/* Eat up the rest of the current line */ -int sDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -int sParseIntFormat(char *buf, int *num, int *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - sscanf(tmp, "%d", num); - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - sscanf(tmp, "%d", size); - return 0; -} - -int sParseFloatFormat(char *buf, int *num, int *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ - - return 0; -} - -int sReadVector(FILE *fp, int n, int *where, int perline, int persize) -{ - register int i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jops; - - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - nextlu = xlusup[jcol]; - - /* - * Process the supernodal portion of L\U[*,j] - */ - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = 0; - ++nextlu; - } - - xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ - - if ( fsupc < jcol ) { - - luptr = xlusup[fsupc]; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nsupc = jcol - fsupc; /* Excluding jcol */ - ufirst = xlusup[jcol]; /* Points to the beginning of column - jcol in supernode L\U(jsupno). */ - nrow = nsupr - nsupc; - - ops[TRSV] += nsupc * (nsupc - 1); - ops[GEMV] += 2 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], &tempv[0] ); - - /* Scatter tempv[*] into lusup[*] */ - iptr = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - lusup[iptr++] -= tempv[i]; - tempv[i] = 0.0; - } -#endif - - } - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_sdefs.h" - -int -ssnode_dfs ( - const int jcol, /* in - start of the supernode */ - const int kcol, /* in - end of the supernode */ - const int *asub, /* in */ - const int *xa_begin, /* in */ - const int *xa_end, /* in */ - int *xprune, /* out */ - int *marker, /* modified */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* Purpose - * ======= - * ssnode_dfs() - Determine the union of the row structures of those - * columns within the relaxed snode. - * Note: The relaxed snodes are leaves of the supernodal etree, therefore, - * the portion outside the rectangular supernode must be zero. - * - * Return value - * ============ - * 0 success; - * >0 number of bytes allocated when run out of memory. - * - */ - register int i, k, ifrom, ito, nextl, new_next; - int nsuper, krow, kmark, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - int nzlmax; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - nsuper = ++supno[jcol]; /* Next available supernode number */ - nextl = xlsub[jcol]; - - for (i = jcol; i <= kcol; i++) { - /* For each nonzero in A[*,i] */ - for (k = xa_begin[i]; k < xa_end[i]; k++) { - krow = asub[k]; - kmark = marker[krow]; - if ( kmark != kcol ) { /* First time visit krow */ - marker[krow] = kcol; - lsub[nextl++] = krow; - if ( nextl >= nzlmax ) { - if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - } - } - supno[i] = nsuper; - } - - /* Supernode > 1, then make a copy of the subscripts for pruning */ - if ( jcol < kcol ) { - new_next = nextl + (nextl - xlsub[jcol]); - while ( new_next > nzlmax ) { - if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - ito = nextl; - for (ifrom = xlsub[jcol]; ifrom < nextl; ) - lsub[ito++] = lsub[ifrom++]; - for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; - nextl = ito; - } - - xsup[nsuper+1] = kcol + 1; - supno[kcol+1] = nsuper; - xprune[kcol] = nextl; - xlsub[kcol+1] = nextl; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,470 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: ssp_blas2.c - * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. - */ - -#include "slu_sdefs.h" - -/* - * Function prototypes - */ -void susolve(int, int, float*, float*); -void slsolve(int, int, float*, float*); -void smatvec(int, int, int, float*, float*, float*); - - -int -sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, - SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * sp_strsv() solves one of the systems of equations - * A*x = b, or A'*x = b, - * where b and x are n element vectors and A is a sparse unit , or - * non-unit, upper or lower triangular matrix. - * No test for singularity or near-singularity is included in this - * routine. Such tests must be performed before calling this routine. - * - * Parameters - * ========== - * - * uplo - (input) char* - * On entry, uplo specifies whether the matrix is an upper or - * lower triangular matrix as follows: - * uplo = 'U' or 'u' A is an upper triangular matrix. - * uplo = 'L' or 'l' A is a lower triangular matrix. - * - * trans - (input) char* - * On entry, trans specifies the equations to be solved as - * follows: - * trans = 'N' or 'n' A*x = b. - * trans = 'T' or 't' A'*x = b. - * trans = 'C' or 'c' A'*x = b. - * - * diag - (input) char* - * On entry, diag specifies whether or not A is unit - * triangular as follows: - * diag = 'U' or 'u' A is assumed to be unit triangular. - * diag = 'N' or 'n' A is not assumed to be unit - * triangular. - * - * L - (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU. - * - * U - (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. - * U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU. - * - * x - (input/output) float* - * Before entry, the incremented array X must contain the n - * element right-hand side vector b. On exit, X is overwritten - * with the solution vector x. - * - * info - (output) int* - * If *info = -i, the i-th argument had an illegal value. - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - SCformat *Lstore; - NCformat *Ustore; - float *Lval, *Uval; - int incx = 1, incy = 1; - float alpha = 1.0, beta = 1.0; - int nrow; - int fsupc, nsupr, nsupc, luptr, istart, irow; - int i, k, iptr, jcol; - float *work; - flops_t solve_ops; - - /* Test the input parameters */ - *info = 0; - if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; - else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && - !lsame_(trans, "C")) *info = -2; - else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; - else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - xerbla_("sp_strsv", &i); - return 0; - } - - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( !(work = floatCalloc(L->nrow)) ) - ABORT("Malloc fails for work in sp_strsv()."); - - if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - solve_ops += nsupc * (nsupc - 1); - solve_ops += 2 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - x[irow] -= x[fsupc] * Lval[luptr]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#endif -#else - slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - x[irow] -= work[i]; /* Scatter */ - work[i] = 0.0; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - x[irow] -= x[fsupc] * Uval[i]; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif -#else - susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - x[irow] -= x[jcol] * Uval[i]; - } - } - } - } /* for k ... */ - - } - } else { /* Form x := inv(A')*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 2 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - x[jcol] -= x[irow] * Lval[i]; - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - x[jcol] -= x[irow] * Uval[i]; - } - } - - solve_ops += nsupc * (nsupc + 1); - - if ( nsupc == 1 ) { - x[fsupc] /= Lval[luptr]; - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - stat->ops[SOLVE] += solve_ops; - SUPERLU_FREE(work); - return 0; -} - - - - -int -sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x, - int incx, float beta, float *y, int incy) -{ -/* Purpose - ======= - - sp_sgemv() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) float - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. - In the future, more general A can be handled. - - X - (input) float*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) float - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) float*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - float *Aval; - int info; - float temp; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - - notran = lsame_(trans, "N"); - Astore = A->Store; - Aval = Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - xerbla_("sp_sgemv ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) - return 0; - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (lsame_(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if (beta != 1.) { - if (incy == 1) { - if (beta == 0.) - for (i = 0; i < leny; ++i) y[i] = 0.; - else - for (i = 0; i < leny; ++i) y[i] = beta * y[i]; - } else { - iy = ky; - if (beta == 0.) - for (i = 0; i < leny; ++i) { - y[iy] = 0.; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - y[iy] = beta * y[iy]; - iy += incy; - } - } - } - - if (alpha == 0.) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if (x[jx] != 0.) { - temp = alpha * x[jx]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - y[irow] += temp * Aval[i]; - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp += Aval[i] * x[irow]; - } - y[jy] += alpha * temp; - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_sgemv */ - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas3.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "slu_sdefs.h" - -int -sp_sgemm(char *transa, char *transb, int m, int n, int k, - float alpha, SuperMatrix *A, float *b, int ldb, - float beta, float *c, int ldc) -{ -/* Purpose - ======= - - sp_s performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) float - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. - In the future, more general A can be handled. - - B - FLOAT PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) float - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - FLOAT PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_sgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/superlu_timer.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/superlu_timer.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/superlu_timer.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/superlu_timer.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,56 +0,0 @@ -/* - * Purpose - * ======= - * Returns the time in seconds used by the process. - * - * Note: the timer function call is machine dependent. Use conditional - * compilation to choose the appropriate function. - * - */ - -#include "slu_ddefs.h" - -#ifdef SUN -/* - * It uses the system call gethrtime(3C), which is accurate to - * nanoseconds. -*/ -#include - -double SuperLU_timer_() { - return ( (double)gethrtime() / 1e9 ); -} - -#else - -#ifndef NO_TIMER -#include -#include -#ifndef WIN32 -#include -#include -#endif -#endif - -#ifndef CLK_TCK -#define CLK_TCK 60 -#endif - -double SuperLU_timer_() -{ -#ifdef NO_TIMER - /* no sys/times.h on WIN32 */ - double tmp; - tmp = 0.0; -#else - struct tms use; - double tmp; - times(&use); - tmp = use.tms_utime; - tmp += use.tms_stime; -#endif - return (double)(tmp) / CLK_TCK; -} - -#endif - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/supermatrix.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/supermatrix.h --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/supermatrix.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/supermatrix.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,140 +0,0 @@ -#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ -#define __SUPERLU_SUPERMATRIX - -/******************************************** - * The matrix types are defined as follows. * - ********************************************/ -typedef enum { - SLU_NC, /* column-wise, no supernode */ - SLU_NR, /* row-wize, no supernode */ - SLU_SC, /* column-wise, supernode */ - SLU_SR, /* row-wise, supernode */ - SLU_NCP, /* column-wise, column-permuted, no supernode - (The consecutive columns of nonzeros, after permutation, - may not be stored contiguously.) */ - SLU_DN /* Fortran style column-wise storage for dense matrix */ -} Stype_t; - -typedef enum { - SLU_S, /* single */ - SLU_D, /* double */ - SLU_C, /* single complex */ - SLU_Z /* double complex */ -} Dtype_t; - -typedef enum { - SLU_GE, /* general */ - SLU_TRLU, /* lower triangular, unit diagonal */ - SLU_TRUU, /* upper triangular, unit diagonal */ - SLU_TRL, /* lower triangular */ - SLU_TRU, /* upper triangular */ - SLU_SYL, /* symmetric, store lower half */ - SLU_SYU, /* symmetric, store upper half */ - SLU_HEL, /* Hermitian, store lower half */ - SLU_HEU /* Hermitian, store upper half */ -} Mtype_t; - -typedef struct { - Stype_t Stype; /* Storage type: interprets the storage structure - pointed to by *Store. */ - Dtype_t Dtype; /* Data type. */ - Mtype_t Mtype; /* Matrix type: describes the mathematical property of - the matrix. */ - int_t nrow; /* number of rows */ - int_t ncol; /* number of columns */ - void *Store; /* pointer to the actual storage of the matrix */ -} SuperMatrix; - -/*********************************************** - * The storage schemes are defined as follows. * - ***********************************************/ - -/* Stype == NC (Also known as Harwell-Boeing sparse matrix format) */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind; /* pointer to array of row indices of the nonzeros */ - int_t *colptr; /* pointer to array of beginning of columns in nzval[] - and rowind[] */ - /* Note: - Zero-based indexing is used; - colptr[] has ncol+1 entries, the last one pointing - beyond the last column, so that colptr[ncol] = nnz. */ -} NCformat; - -/* Stype == NR (Also known as row compressed storage (RCS). */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by row */ - int_t *colind; /* pointer to array of column indices of the nonzeros */ - int_t *rowptr; /* pointer to array of beginning of rows in nzval[] - and colind[] */ - /* Note: - Zero-based indexing is used; - nzval[] and colind[] are of the same length, nnz; - rowptr[] has nrow+1 entries, the last one pointing - beyond the last column, so that rowptr[nrow] = nnz. */ -} NRformat; - -/* Stype == SC */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - int_t nsuper; /* number of supernodes, minus 1 */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ - int_t *rowind; /* pointer to array of compressed row indices of - rectangular supernodes */ - int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ - int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column - j belongs; mapping from column to supernode number. */ - int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th - supernode; mapping from supernode number to column. - e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) - sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ - /* Note: - Zero-based indexing is used; - nzval_colptr[], rowind_colptr[], col_to_sup and - sup_to_col[] have ncol+1 entries, the last one - pointing beyond the last column. - For col_to_sup[], only the first ncol entries are - defined. For sup_to_col[], only the first nsuper+2 - entries are defined. */ -} SCformat; - -/* Stype == NCP */ -typedef struct { - int_t nnz; /* number of nonzeros in the matrix */ - void *nzval; /* pointer to array of nonzero values, packed by column */ - int_t *rowind;/* pointer to array of row indices of the nonzeros */ - /* Note: nzval[]/rowind[] always have the same length */ - int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] - and rowind[] */ - int_t *colend;/* colend[j] points to one past the last element of column - j in nzval[] and rowind[] */ - /* Note: - Zero-based indexing is used; - The consecutive columns of the nonzeros may not be - contiguous in storage, because the matrix has been - postmultiplied by a column permutation matrix. */ -} NCPformat; - -/* Stype == DN */ -typedef struct { - int_t lda; /* leading dimension */ - void *nzval; /* array of size lda*ncol to represent a dense matrix */ -} DNformat; - - - -/********************************************************* - * Macros used for easy access of sparse matrix entries. * - *********************************************************/ -#define L_SUB_START(col) ( Lstore->rowind_colptr[col] ) -#define L_SUB(ptr) ( Lstore->rowind[ptr] ) -#define L_NZ_START(col) ( Lstore->nzval_colptr[col] ) -#define L_FST_SUPC(superno) ( Lstore->sup_to_col[superno] ) -#define U_NZ_START(col) ( Ustore->colptr[col] ) -#define U_SUB(ptr) ( Ustore->rowind[ptr] ) - - -#endif /* __SUPERLU_SUPERMATRIX */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sutil.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,481 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "slu_sdefs.h" - -void -sCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, - float *nzval, int *rowind, int *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -sCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, - float *nzval, int *colind, int *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* Copy matrix A into matrix B. */ -void -sCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void -sCreate_Dense_Matrix(SuperMatrix *X, int m, int n, float *x, int ldx, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (float *) x; -} - -void -sCopy_Dense_Matrix(int M, int N, float *X, int ldx, - float *Y, int ldy) -{ -/* - * - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -sCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, - float *nzval, int *nzval_colptr, int *rowind, - int *rowind_colptr, int *col_to_sup, int *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -sCompRow_to_CompCol(int m, int n, int nnz, - float *a, int *colind, int *rowptr, - float **at, int **rowind, int **colptr) -{ - register int i, j, col, relpos; - int *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (float *) floatMalloc(nnz); - *rowind = (int *) intMalloc(nnz); - *colptr = (int *) intMalloc(n+1); - marker = (int *) intCalloc(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - - -void -sPrint_CompCol_Matrix(char *what, SuperMatrix *A) -{ - NCformat *Astore; - register int i,n; - float *dp; - - printf("\nCompCol matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (NCformat *) A->Store; - dp = (float *) Astore->nzval; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - printf("nzval: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%f ", dp[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); - printf("\n"); - fflush(stdout); -} - -void -sPrint_SuperNode_Matrix(char *what, SuperMatrix *A) -{ - SCformat *Astore; - register int i, j, k, c, d, n, nsup; - float *dp; - int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; - - printf("\nSuperNode matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (SCformat *) A->Store; - dp = (float *) Astore->nzval; - col_to_sup = Astore->col_to_sup; - sup_to_col = Astore->sup_to_col; - rowind_colptr = Astore->rowind_colptr; - rowind = Astore->rowind; - printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", - A->nrow,A->ncol,Astore->nnz,Astore->nsuper); - printf("nzval:\n"); - for (k = 0; k <= Astore->nsuper; ++k) { - c = sup_to_col[k]; - nsup = sup_to_col[k+1] - c; - for (j = c; j < c + nsup; ++j) { - d = Astore->nzval_colptr[j]; - for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { - printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]); - } - } - } -#if 0 - for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); -#endif - printf("\nnzval_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->rowind_colptr[n]; ++i) - printf("%d ", Astore->rowind[i]); - printf("\nrowind_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); - printf("\ncol_to_sup: "); - for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); - printf("\nsup_to_col: "); - for (i = 0; i <= Astore->nsuper+1; ++i) - printf("%d ", sup_to_col[i]); - printf("\n"); - fflush(stdout); -} - -void -sPrint_Dense_Matrix(char *what, SuperMatrix *A) -{ - DNformat *Astore; - register int i, j, lda = Astore->lda; - float *dp; - - printf("\nDense matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - dp = (float *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); - printf("\nnzval: "); - for (j = 0; j < A->ncol; ++j) { - for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i + j*lda]); - printf("\n"); - } - printf("\n"); - fflush(stdout); -} - -/* - * Diagnostic print of column "jcol" in the U/L factor. - */ -void -sprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) -{ - int i, k, fsupc; - int *xsup, *supno; - int *xlsub, *lsub; - float *lusup; - int *xlusup; - float *ucol; - int *usub, *xusub; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - - printf("%s", msg); - printf("col %d: pivrow %d, supno %d, xprune %d\n", - jcol, pivrow, supno[jcol], xprune[jcol]); - - printf("\tU-col:\n"); - for (i = xusub[jcol]; i < xusub[jcol+1]; i++) - printf("\t%d%10.4f\n", usub[i], ucol[i]); - printf("\tL-col in rectangular snode:\n"); - fsupc = xsup[supno[jcol]]; /* first col of the snode */ - i = xlsub[fsupc]; - k = xlusup[jcol]; - while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { - printf("\t%d\t%10.4f\n", lsub[i], lusup[k]); - i++; k++; - } - fflush(stdout); -} - - -/* - * Check whether tempv[] == 0. This should be true before and after - * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". - */ -void scheck_tempv(int n, float *tempv) -{ - int i; - - for (i = 0; i < n; i++) { - if (tempv[i] != 0.0) - { - fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]); - ABORT("scheck_tempv"); - } - } -} - - -void -sGenXtrue(int n, int nrhs, float *x, int ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - x[i + j*ldx] = 1.0;/* + (float)(i+1.)/n;*/ - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -sFillRHS(trans_t trans, int nrhs, float *x, int ldx, - SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore; - float *Aval; - DNformat *Bstore; - float *rhs; - float one = 1.0; - float zero = 0.0; - int ldc; - char transc[1]; - - Astore = A->Store; - Aval = (float *) Astore->nzval; - Bstore = B->Store; - rhs = Bstore->nzval; - ldc = Bstore->lda; - - if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; - else *(unsigned char *)transc = 'T'; - - sp_sgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldc); - -} - -/* - * Fills a float precision array with a given value. - */ -void -sfill(float *a, int alen, float dval) -{ - register int i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void sinf_norm_error(int nrhs, SuperMatrix *X, float *xtrue) -{ - DNformat *Xstore; - float err, xnorm; - float *Xmat, *soln_work; - int i, j; - - Xstore = X->Store; - Xmat = Xstore->nzval; - - for (j = 0; j < nrhs; j++) { - soln_work = &Xmat[j*Xstore->lda]; - err = xnorm = 0.0; - for (i = 0; i < X->nrow; i++) { - err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i])); - xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i])); - } - err = err / xnorm; - printf("||X - Xtrue||/||X|| = %e\n", err); - } -} - - - -/* Print performance of the code. */ -void -sPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, - float rpg, float rcond, float *ferr, - float *berr, char *equed, SuperLUStat_t *stat) -{ - SCformat *Lstore; - NCformat *Ustore; - double *utime; - flops_t *ops; - - utime = stat->utime; - ops = stat->ops; - - if ( utime[FACT] != 0. ) - printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], - ops[FACT]*1e-6/utime[FACT]); - printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); - if ( utime[SOLVE] != 0. ) - printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], - ops[SOLVE]*1e-6/utime[SOLVE]); - - Lstore = (SCformat *) L->Store; - Ustore = (NCformat *) U->Store; - printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); - printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); - printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); - - printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", - mem_usage->for_lu/1e6, mem_usage->total_needed/1e6, - mem_usage->expansions); - - printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); - printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", - utime[FACT], ops[FACT]*1e-6/utime[FACT], - utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], - utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); - - printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); - printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", - rpg, rcond, ferr[0], berr[0], equed); - -} - - - - -print_float_vec(char *what, int n, float *vec) -{ - int i; - printf("%s: n %d\n", what, n); - for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]); - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/xerbla.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/xerbla.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/xerbla.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/xerbla.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -#include -#include "slu_Cnames.h" - -/* Subroutine */ int xerbla_(char *srname, int *info) -{ -/* -- LAPACK auxiliary routine (version 2.0) -- - Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., - Courant Institute, Argonne National Lab, and Rice University - September 30, 1994 - - - Purpose - ======= - - XERBLA is an error handler for the LAPACK routines. - It is called by an LAPACK routine if an input parameter has an - invalid value. A message is printed and execution stops. - - Installers may consider modifying the STOP statement in order to - call system-specific exception-handling facilities. - - Arguments - ========= - - SRNAME (input) CHARACTER*6 - The name of the routine which called XERBLA. - - INFO (input) INT - The position of the invalid parameter in the parameter list - - of the calling routine. - - ===================================================================== -*/ - - printf("** On entry to %6s, parameter number %2d had an illegal value\n", - srname, *info); - -/* End of XERBLA */ - - return 0; -} /* xerbla_ */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,363 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_zdefs.h" - -/* - * Function prototypes - */ -void zusolve(int, int, doublecomplex*, doublecomplex*); -void zlsolve(int, int, doublecomplex*, doublecomplex*); -void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); - - - -/* Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -zcolumn_bmod ( - const int jcol, /* in */ - const int nseg, /* in */ - doublecomplex *dense, /* in */ - doublecomplex *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in */ - int fpanelc, /* in -- first column in the current panel */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose: - * ======== - * Performs numeric block updates (sup-col) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - doublecomplex alpha, beta; - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in supernode - * nsupr = no of rows in supernode (used as leading dimension) - * luptr = location of supernodal LU-block in storage - * kfnz = first nonz in the k-th supernodal segment - * no_zeros = no of leading zeros in a supernodal U-segment - */ - doublecomplex ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int fsupc, nsupc, nsupr, segsze; - int nrow; /* No of rows in the matrix of matrix-vector */ - int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; - register int lptr, kfnz, isub, irow, i; - register int no_zeros, new_next; - int ufirst, nextlu; - int fst_col; /* First column within small LU update */ - int d_fsupc; /* Distance between the first column of the current - panel and the first column of the current snode. */ - int *xsup, *supno; - int *lsub, *xlsub; - doublecomplex *lusup; - int *xlusup; - int nzlumax; - doublecomplex *tempv1; - doublecomplex zero = {0.0, 0.0}; - doublecomplex one = {1.0, 0.0}; - doublecomplex none = {-1.0, 0.0}; - doublecomplex comp_temp, comp_temp1; - int mem_error; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - nzlumax = Glu->nzlumax; - jcolp1 = jcol + 1; - jsupno = supno[jcol]; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - - krep = segrep[k]; - k--; - ksupno = supno[krep]; - if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ - - fsupc = xsup[ksupno]; - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - /* Distance from the current supernode to the current panel; - d_fsupc=0 if fsupc > fpanelc. */ - d_fsupc = fst_col - fsupc; - - luptr = xlusup[fst_col] + d_fsupc; - lptr = xlsub[fsupc] + d_fsupc; - - kfnz = repfnz[krep]; - kfnz = SUPERLU_MAX ( kfnz, fpanelc ); - - segsze = krep - kfnz + 1; - nsupc = krep - fst_col + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nrow = nsupr - d_fsupc - nsupc; - krep_ind = lptr + nsupc - 1; - - ops[TRSV] += 4 * segsze * (segsze - 1); - ops[GEMV] += 8 * nrow * segsze; - - - - /* - * Case 1: Update U-segment of size 1 -- col-col update - */ - if ( segsze == 1 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - z_sub(&dense[irow], &dense[irow], &comp_temp); - luptr++; - } - - } else if ( segsze <= 3 ) { - ukj = dense[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { /* Case 2: 2cols-col update */ - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - z_sub(&ukj, &ukj, &comp_temp); - dense[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense[irow], &dense[irow], &comp_temp); - } - } else { /* Case 3: 3cols-col update */ - ukj2 = dense[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - z_sub(&ukj1, &ukj1, &comp_temp); - - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&ukj, &ukj, &comp_temp); - - dense[lsub[krep_ind]] = ukj; - dense[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; - luptr1++; - luptr2++; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense[irow], &dense[irow], &comp_temp); - } - } - - - } else { - /* - * Case: sup-col update - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense - */ - - no_zeros = kfnz - fst_col; - - /* Copy U[*,j] segment from dense[*] to tempv[*] */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - tempv[i] = dense[irow]; - ++isub; - } - - /* Dense triangular solve -- start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); -#endif - - - /* Scatter tempv[] into SPA dense[] as a temporary storage */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense[irow] = tempv[i]; - tempv[i] = zero; - ++isub; - } - - /* Scatter tempv1[] into SPA dense[] */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - z_sub(&dense[irow], &dense[irow], &tempv1[i]); - tempv1[i] = zero; - ++isub; - } - } - - } /* if jsupno ... */ - - } /* for each segment... */ - - /* - * Process the supernodal portion of L\U[*,j] - */ - nextlu = xlusup[jcol]; - fsupc = xsup[jsupno]; - - /* Copy the SPA dense into L\U[*,j] */ - new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; - while ( new_next > nzlumax ) { - if (mem_error = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) - return (mem_error); - lusup = Glu->lusup; - lsub = Glu->lsub; - } - - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = zero; - ++nextlu; - } - - xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ - - /* For more updates within the panel (also within the current supernode), - * should start from the first column of the panel, or the first column - * of the supernode, whichever is bigger. There are 2 cases: - * 1) fsupc < fpanelc, then fst_col := fpanelc - * 2) fsupc >= fpanelc, then fst_col := fsupc - */ - fst_col = SUPERLU_MAX ( fsupc, fpanelc ); - - if ( fst_col < jcol ) { - - /* Distance between the current supernode and the current panel. - d_fsupc=0 if fsupc >= fpanelc. */ - d_fsupc = fst_col - fsupc; - - lptr = xlsub[fsupc] + d_fsupc; - luptr = xlusup[fst_col] + d_fsupc; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ - nsupc = jcol - fst_col; /* Excluding jcol */ - nrow = nsupr - d_fsupc - nsupc; - - /* Points to the beginning of jcol in snode L\U(jsupno) */ - ufirst = xlusup[jcol] + d_fsupc; - - ops[TRSV] += 4 * nsupc * (nsupc - 1); - ops[GEMV] += 8 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#else - ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], - &nsupr, &lusup[ufirst], &incx ); -#endif - - alpha = none; beta = one; /* y := beta*y + alpha*A*x */ - -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - - zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], tempv ); - - /* Copy updates from tempv[*] into lusup[*] */ - isub = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - z_sub(&lusup[isub], &lusup[isub], &tempv[i]); - tempv[i] = zero; - ++isub; - } - -#endif - - - } /* if fst_col < jcol ... */ - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,269 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -/* What type of supernodes we want */ -#define T2_SUPER - -int -zcolumn_dfs( - const int m, /* in - number of rows in the matrix */ - const int jcol, /* in */ - int *perm_r, /* in */ - int *nseg, /* modified - with new segments appended */ - int *lsub_col, /* in - defines the RHS vector to start the dfs */ - int *segrep, /* modified - with new segments appended */ - int *repfnz, /* modified */ - int *xprune, /* modified */ - int *marker, /* modified */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * "column_dfs" performs a symbolic factorization on column jcol, and - * decide the supernode boundary. - * - * This routine does not use numeric values, but only use the RHS - * row indices to start the dfs. - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. The routine returns a list of such supernodal - * representatives in topological order of the dfs that generates them. - * The location of the first nonzero in each such supernodal segment - * (supernodal entry location) is also returned. - * - * Local parameters - * ================ - * nseg: no of segments in current U[*,j] - * jsuper: jsuper=EMPTY if column j does not belong to the same - * supernode as j-1. Otherwise, jsuper=nsuper. - * - * marker2: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - * Return value - * ============ - * 0 success; - * > 0 number of bytes allocated when run out of space. - * - */ - int jcolp1, jcolm1, jsuper, nsuper, nextl; - int k, krep, krow, kmark, kperm; - int *marker2; /* Used for small panel LU */ - int fsupc; /* First column of a snode */ - int myfnz; /* First nonz column of a U-segment */ - int chperm, chmark, chrep, kchild; - int xdfs, maxdfs, kpar, oldrep; - int jptr, jm1ptr; - int ito, ifrom, istop; /* Used to compress row subscripts */ - int mem_error; - int *xsup, *supno, *lsub, *xlsub; - int nzlmax; - static int first = 1, maxsuper; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - if ( first ) { - maxsuper = sp_ienv(3); - first = 0; - } - jcolp1 = jcol + 1; - jcolm1 = jcol - 1; - nsuper = supno[jcol]; - jsuper = nsuper; - nextl = xlsub[jcol]; - marker2 = &marker[2*m]; - - - /* For each nonzero in A[*,jcol] do dfs */ - for (k = 0; lsub_col[k] != EMPTY; k++) { - - krow = lsub_col[k]; - lsub_col[k] = EMPTY; - kmark = marker2[krow]; - - /* krow was visited before, go to the next nonz */ - if ( kmark == jcol ) continue; - - /* For each unmarked nbr krow of jcol - * krow is in L: place it in structure of L[*,jcol] - */ - marker2[krow] = jcol; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - lsub[nextl++] = krow; /* krow is indexed into A */ - if ( nextl >= nzlmax ) { - if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ - } else { - /* krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz[krep]; - - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > kperm ) repfnz[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker2[kchild]; - - if ( chmark != jcol ) { /* Not reached yet */ - marker2[kchild] = jcol; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,k] */ - if ( chperm == EMPTY ) { - lsub[nextl++] = kchild; - if ( nextl >= nzlmax ) { - if ( mem_error = - zLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - if ( chmark != jcolm1 ) jsuper = EMPTY; - } else { - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz[chrep]; - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz[chrep] = chperm; - } else { - /* Continue dfs at super-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L^t) */ - parent[krep] = oldrep; - repfnz[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - } /* else */ - - } /* else */ - - } /* if */ - - } /* while */ - - /* krow has no more unexplored nbrs; - * place supernode-rep krep in postorder DFS. - * backtrack dfs to its parent - */ - segrep[*nseg] = krep; - ++(*nseg); - kpar = parent[krep]; /* Pop from stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - - } while ( kpar != EMPTY ); /* Until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonzero ... */ - - /* Check to see if j belongs in the same supernode as j-1 */ - if ( jcol == 0 ) { /* Do nothing for column 0 */ - nsuper = supno[0] = 0; - } else { - fsupc = xsup[nsuper]; - jptr = xlsub[jcol]; /* Not compressed yet */ - jm1ptr = xlsub[jcolm1]; - -#ifdef T2_SUPER - if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; -#endif - /* Make sure the number of columns in a supernode doesn't - exceed threshold. */ - if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; - - /* If jcol starts a new supernode, reclaim storage space in - * lsub from the previous supernode. Note we only store - * the subscript set of the first and last columns of - * a supernode. (first for num values, last for pruning) - */ - if ( jsuper == EMPTY ) { /* starts a new supernode */ - if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ -#ifdef CHK_COMPRESS - printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); -#endif - ito = xlsub[fsupc+1]; - xlsub[jcolm1] = ito; - istop = ito + jptr - jm1ptr; - xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ - xlsub[jcol] = istop; - for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) - lsub[ito] = lsub[ifrom]; - nextl = ito; /* = istop + length(jcol) */ - } - nsuper++; - supno[jcol] = nsuper; - } /* if a new supernode */ - - } /* else: jcol > 0 */ - - /* Tidy up the pointers before exit */ - xsup[nsuper+1] = jcolp1; - supno[jcolp1] = nsuper; - xprune[jcol] = nextl; /* Initialize upper bound for pruning */ - xlsub[jcolp1] = nextl; - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,103 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -int -zcopy_to_ucol( - int jcol, /* in */ - int nseg, /* in */ - int *segrep, /* in */ - int *repfnz, /* in */ - int *perm_r, /* in */ - doublecomplex *dense, /* modified - reset to zero on return */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Gather from SPA dense[*] to global ucol[*]. - */ - int ksub, krep, ksupno; - int i, k, kfnz, segsze; - int fsupc, isub, irow; - int jsupno, nextu; - int new_next, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - doublecomplex *ucol; - int *usub, *xusub; - int nzumax; - - doublecomplex zero = {0.0, 0.0}; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - nzumax = Glu->nzumax; - - jsupno = supno[jcol]; - nextu = xusub[jcol]; - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { - krep = segrep[k--]; - ksupno = supno[krep]; - - if ( ksupno != jsupno ) { /* Should go into ucol[] */ - kfnz = repfnz[krep]; - if ( kfnz != EMPTY ) { /* Nonzero U-segment */ - - fsupc = xsup[ksupno]; - isub = xlsub[fsupc] + kfnz - fsupc; - segsze = krep - kfnz + 1; - - new_next = nextu + segsze; - while ( new_next > nzumax ) { - if (mem_error = zLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) - return (mem_error); - ucol = Glu->ucol; - if (mem_error = zLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) - return (mem_error); - usub = Glu->usub; - lsub = Glu->lsub; - } - - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - usub[nextu] = perm_r[irow]; - ucol[nextu] = dense[irow]; - dense[irow] = zero; - nextu++; - isub++; - } - - } - - } - - } /* for each segment... */ - - xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgscon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgscon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgscon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,143 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: zgscon.c - * History: Modified from lapack routines ZGECON. - */ -#include -#include "slu_zdefs.h" - -void -zgscon(char *norm, SuperMatrix *L, SuperMatrix *U, - double anorm, double *rcond, SuperLUStat_t *stat, int *info) -{ -/* - Purpose - ======= - - ZGSCON estimates the reciprocal of the condition number of a general - real matrix A, in either the 1-norm or the infinity-norm, using - the LU factorization computed by ZGETRF. - - An estimate is obtained for norm(inv(A)), and the reciprocal of the - condition number is computed as - RCOND = 1 / ( norm(A) * norm(inv(A)) ). - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - NORM (input) char* - Specifies whether the 1-norm condition number or the - infinity-norm condition number is required: - = '1' or 'O': 1-norm; - = 'I': Infinity-norm. - - L (input) SuperMatrix* - The factor L from the factorization Pr*A*Pc=L*U as computed by - zgstrf(). Use compressed row subscripts storage for supernodes, - i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - - U (input) SuperMatrix* - The factor U from the factorization Pr*A*Pc=L*U as computed by - zgstrf(). Use column-wise storage scheme, i.e., U has types: - Stype = SLU_NC, Dtype = SLU_Z, Mtype = TRU. - - ANORM (input) double - If NORM = '1' or 'O', the 1-norm of the original matrix A. - If NORM = 'I', the infinity-norm of the original matrix A. - - RCOND (output) double* - The reciprocal of the condition number of the matrix A, - computed as RCOND = 1/(norm(A) * norm(inv(A))). - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - - ===================================================================== -*/ - - /* Local variables */ - int kase, kase1, onenrm, i; - double ainvnm; - doublecomplex *work; - extern int zrscl_(int *, doublecomplex *, doublecomplex *, int *); - - extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); - - - /* Test the input parameters. */ - *info = 0; - onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); - if (! onenrm && ! lsame_(norm, "I")) *info = -1; - else if (L->nrow < 0 || L->nrow != L->ncol || - L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU) - *info = -2; - else if (U->nrow < 0 || U->nrow != U->ncol || - U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU) - *info = -3; - if (*info != 0) { - i = -(*info); - xerbla_("zgscon", &i); - return; - } - - /* Quick return if possible */ - *rcond = 0.; - if ( L->nrow == 0 || U->nrow == 0) { - *rcond = 1.; - return; - } - - work = doublecomplexCalloc( 3*L->nrow ); - - - if ( !work ) - ABORT("Malloc fails for work arrays in zgscon."); - - /* Estimate the norm of inv(A). */ - ainvnm = 0.; - if ( onenrm ) kase1 = 1; - else kase1 = 2; - kase = 0; - - do { - zlacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase); - - if (kase == 0) break; - - if (kase == kase1) { - /* Multiply by inv(L). */ - sp_ztrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); - - /* Multiply by inv(U). */ - sp_ztrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); - - } else { - - /* Multiply by inv(U'). */ - sp_ztrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); - - /* Multiply by inv(L'). */ - sp_ztrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); - - } - - } while ( kase != 0 ); - - /* Compute the estimate of the reciprocal condition number. */ - if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; - - SUPERLU_FREE (work); - return; - -} /* zgscon */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsequ.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsequ.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsequ.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,184 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: zgsequ.c - * History: Modified from LAPACK routine ZGEEQU - */ -#include -#include "slu_zdefs.h" - -void -zgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int *info) -{ -/* - Purpose - ======= - - ZGSEQU computes row and column scalings intended to equilibrate an - M-by-N sparse matrix A and reduce its condition number. R returns the row - scale factors and C the column scale factors, chosen to try to make - the largest element in each row and column of the matrix B with - elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. - - R(i) and C(j) are restricted to be between SMLNUM = smallest safe - number and BIGNUM = largest safe number. Use of these scaling - factors is not guaranteed to reduce the condition number of A but - works well in practice. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input) SuperMatrix* - The matrix of dimension (A->nrow, A->ncol) whose equilibration - factors are to be computed. The type of A can be: - Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. - - R (output) double*, size A->nrow - If INFO = 0 or INFO > M, R contains the row scale factors - for A. - - C (output) double*, size A->ncol - If INFO = 0, C contains the column scale factors for A. - - ROWCND (output) double* - If INFO = 0 or INFO > M, ROWCND contains the ratio of the - smallest R(i) to the largest R(i). If ROWCND >= 0.1 and - AMAX is neither too large nor too small, it is not worth - scaling by R. - - COLCND (output) double* - If INFO = 0, COLCND contains the ratio of the smallest - C(i) to the largest C(i). If COLCND >= 0.1, it is not - worth scaling by C. - - AMAX (output) double* - Absolute value of largest matrix element. If AMAX is very - close to overflow or very close to underflow, the matrix - should be scaled. - - INFO (output) int* - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, and i is - <= A->nrow: the i-th row of A is exactly zero - > A->ncol: the (i-M)-th column of A is exactly zero - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double rcmin, rcmax; - double bignum, smlnum; - extern double dlamch_(char *); - - /* Test the input parameters. */ - *info = 0; - if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - xerbla_("zgsequ", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Get machine constants. */ - smlnum = dlamch_("S"); - bignum = 1. / smlnum; - - /* Compute row scale factors. */ - for (i = 0; i < A->nrow; ++i) r[i] = 0.; - - /* Find the maximum element in each row. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (i = 0; i < A->nrow; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); - } - *amax = rcmax; - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - /* Compute column scale factors */ - for (j = 0; j < A->ncol; ++j) c[j] = 0.; - - /* Find the maximum element in each column, assuming the row - scalings computed above. */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] ); - } - - /* Find the maximum and minimum scale factors. */ - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); - } - - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); - } - - return; - -} /* zgsequ */ - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsrfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsrfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsrfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,447 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: zgsrfs.c - * History: Modified from lapack routine ZGERFS - */ -#include -#include "slu_zdefs.h" - -void -zgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, char *equed, double *R, double *C, - SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * ZGSRFS improves the computed solution to a system of linear - * equations and provides error bounds and backward error estimates for - * the solution. - * - * If equilibration was performed, the system becomes: - * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * A (input) SuperMatrix* - * The original matrix A in the system, or the scaled A if - * equilibration was done. The type of A can be: - * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_GE. - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * zgstrf(). Use column-wise storage scheme, - * i.e., U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (A->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * equed (input) Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by - * diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * - * R (input) double*, dimension (A->nrow) - * The row scale factors for A. - * If equed = 'R' or 'B', A is premultiplied by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * - * C (input) double*, dimension (A->ncol) - * The column scale factors for A. - * If equed = 'C' or 'B', A is postmultiplied by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * - * B (input) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * The right hand side matrix B. - * if equed = 'R' or 'B', B is premultiplied by diag(R). - * - * X (input/output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * On entry, the solution matrix X, as computed by zgstrs(). - * On exit, the improved solution matrix X. - * if *equed = 'C' or 'B', X should be premultiplied by diag(C) - * in order to obtain the solution to the original system. - * - * FERR (output) double*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * - * BERR (output) double*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if INFO = -i, the i-th argument had an illegal value - * - * Internal Parameters - * =================== - * - * ITMAX is the maximum number of steps of iterative refinement. - * - */ - -#define ITMAX 5 - - /* Table of constant values */ - int ione = 1; - doublecomplex ndone = {-1., 0.}; - doublecomplex done = {1., 0.}; - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - SuperMatrix Bjcol; - DNformat *Bstore, *Xstore, *Bjcol_store; - doublecomplex *Bmat, *Xmat, *Bptr, *Xptr; - int kase; - double safe1, safe2; - int i, j, k, irow, nz, count, notran, rowequ, colequ; - int ldb, ldx, nrhs; - double s, xk, lstres, eps, safmin; - char transc[1]; - trans_t transt; - doublecomplex *work; - double *rwork; - int *iwork; - extern double dlamch_(char *); - extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); -#ifdef _CRAY - extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *); - extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); -#else - extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *); - extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); -#endif - - Astore = A->Store; - Aval = Astore->nzval; - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - /* Test the input parameters */ - *info = 0; - notran = (trans == NOTRANS); - if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) - *info = -3; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) - *info = -4; - else if ( ldb < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) - *info = -10; - else if ( ldx < SUPERLU_MAX(0, A->nrow) || - X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) - *info = -11; - if (*info != 0) { - i = -(*info); - xerbla_("zgsrfs", &i); - return; - } - - /* Quick return if possible */ - if ( A->nrow == 0 || nrhs == 0) { - for (j = 0; j < nrhs; ++j) { - ferr[j] = 0.; - berr[j] = 0.; - } - return; - } - - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - - /* Allocate working space */ - work = doublecomplexMalloc(2*A->nrow); - rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); - iwork = intMalloc(A->nrow); - if ( !work || !rwork || !iwork ) - ABORT("Malloc fails for work/rwork/iwork."); - - if ( notran ) { - *(unsigned char *)transc = 'N'; - transt = TRANS; - } else { - *(unsigned char *)transc = 'T'; - transt = NOTRANS; - } - - /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ - nz = A->ncol + 1; - eps = dlamch_("Epsilon"); - safmin = dlamch_("Safe minimum"); - safe1 = nz * safmin; - safe2 = safe1 / eps; - - /* Compute the number of nonzeros in each row (or column) of A */ - for (i = 0; i < A->nrow; ++i) iwork[i] = 0; - if ( notran ) { - for (k = 0; k < A->ncol; ++k) - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - ++iwork[Astore->rowind[i]]; - } else { - for (k = 0; k < A->ncol; ++k) - iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; - } - - /* Copy one column of RHS B into Bjcol. */ - Bjcol.Stype = B->Stype; - Bjcol.Dtype = B->Dtype; - Bjcol.Mtype = B->Mtype; - Bjcol.nrow = B->nrow; - Bjcol.ncol = 1; - Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); - Bjcol_store = Bjcol.Store; - Bjcol_store->lda = ldb; - Bjcol_store->nzval = work; /* address aliasing */ - - /* Do for each right hand side ... */ - for (j = 0; j < nrhs; ++j) { - count = 0; - lstres = 3.; - Bptr = &Bmat[j*ldb]; - Xptr = &Xmat[j*ldx]; - - while (1) { /* Loop until stopping criterion is satisfied. */ - - /* Compute residual R = B - op(A) * X, - where op(A) = A, A**T, or A**H, depending on TRANS. */ - -#ifdef _CRAY - CCOPY(&A->nrow, Bptr, &ione, work, &ione); -#else - zcopy_(&A->nrow, Bptr, &ione, work, &ione); -#endif - sp_zgemv(transc, ndone, A, Xptr, ione, done, work, ione); - - /* Compute componentwise relative backward error from formula - max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) - where abs(Z) is the componentwise absolute value of the matrix - or vector Z. If the i-th component of the denominator is less - than SAFE2, then SAFE1 is added to the i-th component of the - numerator and denominator before dividing. */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if (notran) { - for (k = 0; k < A->ncol; ++k) { - xk = z_abs1( &Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]); - } - rwork[k] += s; - } - } - s = 0.; - for (i = 0; i < A->nrow; ++i) { - if (rwork[i] > safe2) - s = SUPERLU_MAX( s, z_abs1(&work[i]) / rwork[i] ); - else - s = SUPERLU_MAX( s, (z_abs1(&work[i]) + safe1) / - (rwork[i] + safe1) ); - } - berr[j] = s; - - /* Test stopping criterion. Continue iterating if - 1) The residual BERR(J) is larger than machine epsilon, and - 2) BERR(J) decreased by at least a factor of 2 during the - last iteration, and - 3) At most ITMAX iterations tried. */ - - if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { - /* Update solution and try again. */ - zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - -#ifdef _CRAY - CAXPY(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#else - zaxpy_(&A->nrow, &done, work, &ione, - &Xmat[j*ldx], &ione); -#endif - lstres = berr[j]; - ++count; - } else { - break; - } - - } /* end while */ - - stat->RefineSteps = count; - - /* Bound error from formula: - norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* - ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) - where - norm(Z) is the magnitude of the largest component of Z - inv(op(A)) is the inverse of op(A) - abs(Z) is the componentwise absolute value of the matrix or - vector Z - NZ is the maximum number of nonzeros in any row of A, plus 1 - EPS is machine epsilon - - The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) - is incremented by SAFE1 if the i-th component of - abs(op(A))*abs(X) + abs(B) is less than SAFE2. - - Use ZLACON to estimate the infinity-norm of the matrix - inv(op(A)) * diag(W), - where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ - - for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); - - /* Compute abs(op(A))*abs(X) + abs(B). */ - if ( notran ) { - for (k = 0; k < A->ncol; ++k) { - xk = z_abs1( &Xptr[k] ); - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) - rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; - } - } else { - for (k = 0; k < A->ncol; ++k) { - s = 0.; - for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { - irow = Astore->rowind[i]; - xk = z_abs1( &Xptr[irow] ); - s += z_abs1(&Aval[i]) * xk; - } - rwork[k] += s; - } - } - - for (i = 0; i < A->nrow; ++i) - if (rwork[i] > safe2) - rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; - else - rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; - kase = 0; - - do { - zlacon_(&A->nrow, &work[A->nrow], work, - &ferr[j], &kase); - if (kase == 0) break; - - if (kase == 1) { - /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) { - zd_mult(&work[i], &work[i], C[i]); - } - else if ( !notran && rowequ ) - for (i = 0; i < A->nrow; ++i) { - zd_mult(&work[i], &work[i], R[i]); - } - - zgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); - - for (i = 0; i < A->nrow; ++i) { - zd_mult(&work[i], &work[i], rwork[i]); - } - } else { - /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ - for (i = 0; i < A->nrow; ++i) { - zd_mult(&work[i], &work[i], rwork[i]); - } - - zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); - - if ( notran && colequ ) - for (i = 0; i < A->ncol; ++i) { - zd_mult(&work[i], &work[i], C[i]); - } - else if ( !notran && rowequ ) - for (i = 0; i < A->ncol; ++i) { - zd_mult(&work[i], &work[i], R[i]); - } - } - - } while ( kase != 0 ); - - /* Normalize error. */ - lstres = 0.; - if ( notran && colequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, C[i] * z_abs1( &Xptr[i]) ); - } else if ( !notran && rowequ ) { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, R[i] * z_abs1( &Xptr[i]) ); - } else { - for (i = 0; i < A->nrow; ++i) - lstres = SUPERLU_MAX( lstres, z_abs1( &Xptr[i]) ); - } - if ( lstres != 0. ) - ferr[j] /= lstres; - - } /* for each RHS j ... */ - - SUPERLU_FREE(work); - SUPERLU_FREE(rwork); - SUPERLU_FREE(iwork); - SUPERLU_FREE(Bjcol.Store); - - return; - -} /* zgsrfs */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssv.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,221 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_zdefs.h" - -void -zgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, - SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * ZGSSV solves the system of linear equations A*X=B, using the - * LU factorization from ZGSTRF. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. Permute the columns of A, forming A*Pc, where Pc - * is a permutation matrix. For more details of this step, - * see sp_preorder.c. - * - * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined - * by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 1.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the - * above algorithm to the transpose of A: - * - * 2.1. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix. - * For more details of this step, see sp_preorder.c. - * - * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr - * determined by Gaussian elimination with partial pivoting. - * L is unit lower triangular with offdiagonal entries - * bounded by 1 in magnitude, and U is upper triangular. - * - * 2.3. Solve the system of equations A*X=B using the factored - * form of A. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR; Dtype = SLU_Z; Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, column permutation vector of size A->ncol - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * If A->Stype = SLU_NR, column permutation vector of size A->nrow - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or - * options->Fact = SamePattern_SameRowPerm, it is an input argument. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * Otherwise, it is an output argument. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->RowPerm = MY_PERMR or - * options->Fact = SamePattern_SameRowPerm, perm_r is an - * input argument. - * otherwise it is an output argument. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * so the solution could not be computed. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - DNformat *Bstore; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int lwork = 0, *etree, i; - - /* Set default values for some parameters */ - double drop_tol = 0.; - int panel_size; /* panel size */ - int relax; /* no of columns in a relaxed snodes */ - int permc_spec; - trans_t trans = NOTRANS; - double *utime; - double t; /* Temporary time */ - - /* Test the input parameters ... */ - *info = 0; - Bstore = B->Store; - if ( options->Fact != DOFACT ) *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) - *info = -7; - if ( *info != 0 ) { - i = -(*info); - xerbla_("zgssv", &i); - return; - } - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - trans = TRANS; - } else { - if ( A->Stype == SLU_NC ) AA = A; - } - - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t; - - etree = intMalloc(A->ncol); - - t = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t; - - panel_size = sp_ienv(1); - relax = sp_ienv(2); - - /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4));*/ - t = SuperLU_timer_(); - /* Compute the LU factorization of A. */ - zgstrf(options, &AC, drop_tol, relax, panel_size, - etree, NULL, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t; - - t = SuperLU_timer_(); - if ( *info == 0 ) { - /* Solve the system A*X=B, overwriting B with X. */ - zgstrs (trans, L, U, perm_c, perm_r, B, stat, info); - } - utime[SOLVE] = SuperLU_timer_() - t; - - SUPERLU_FREE (etree); - Destroy_CompCol_Permuted(&AC); - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssvx.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssvx.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssvx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,614 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_zdefs.h" - -void -zgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, - int *etree, char *equed, double *R, double *C, - SuperMatrix *L, SuperMatrix *U, void *work, int lwork, - SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, - double *rcond, double *ferr, double *berr, - mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) -{ -/* - * Purpose - * ======= - * - * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using - * the LU factorization from zgstrf(). Error bounds on the solution and - * a condition estimate are also provided. It performs the following steps: - * - * 1. If A is stored column-wise (A->Stype = SLU_NC): - * - * 1.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A is - * overwritten by diag(R)*A*diag(C) and B by diag(R)*B - * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans - * = TRANS or CONJ). - * - * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation - * matrix that usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 1.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the matrix A (after equilibration if options->Equil = YES) - * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. - * - * 1.4. Compute the reciprocal pivot growth factor. - * - * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form of - * A is used to estimate the condition number of the matrix A. If - * the reciprocal of the condition number is less than machine - * precision, info = A->ncol+1 is returned as a warning, but the - * routine still goes on to solve for X and computes error bounds - * as described below. - * - * 1.6. The system of equations is solved for X using the factored form - * of A. - * - * 1.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 1.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm - * to the transpose of A: - * - * 2.1. If options->Equil = YES, scaling factors are computed to - * equilibrate the system: - * options->Trans = NOTRANS: - * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B - * options->Trans = TRANS: - * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B - * options->Trans = CONJ: - * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B - * Whether or not the system will be equilibrated depends on the - * scaling of the matrix A, but if equilibration is used, A' is - * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B - * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). - * - * 2.2. Permute columns of transpose(A) (rows of A), - * forming transpose(A)*Pc, where Pc is a permutation matrix that - * usually preserves sparsity. - * For more details of this step, see sp_preorder.c. - * - * 2.3. If options->Fact != FACTORED, the LU decomposition is used to - * factor the transpose(A) (after equilibration if - * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the - * permutation Pr determined by partial pivoting. - * - * 2.4. Compute the reciprocal pivot growth factor. - * - * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the - * routine returns with info = i. Otherwise, the factored form - * of transpose(A) is used to estimate the condition number of the - * matrix A. If the reciprocal of the condition number - * is less than machine precision, info = A->nrow+1 is returned as - * a warning, but the routine still goes on to solve for X and - * computes error bounds as described below. - * - * 2.6. The system of equations is solved for X using the factored form - * of transpose(A). - * - * 2.7. If options->IterRefine != NOREFINE, iterative refinement is - * applied to improve the computed solution matrix and calculate - * error bounds and backward error estimates for it. - * - * 2.8. If equilibration was used, the matrix X is premultiplied by - * diag(C) (if options->Trans = NOTRANS) or diag(R) - * (if options->Trans = TRANS or CONJ) so that it solves the - * original system before equilibration. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed and how the - * system will be solved. - * - * A (input/output) SuperMatrix* - * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number - * of the linear equations is A->nrow. Currently, the type of A can be: - * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. - * In the future, more general A may be handled. - * - * On entry, If options->Fact = FACTORED and equed is not 'N', - * then A must have been equilibrated by the scaling factors in - * R and/or C. - * On exit, A is not modified if options->Equil = NO, or if - * options->Equil = YES but equed = 'N' on exit. - * Otherwise, if options->Equil = YES and equed is not 'N', - * A is scaled as follows: - * If A->Stype = SLU_NC: - * equed = 'R': A := diag(R) * A - * equed = 'C': A := A * diag(C) - * equed = 'B': A := diag(R) * A * diag(C). - * If A->Stype = SLU_NR: - * equed = 'R': transpose(A) := diag(R) * transpose(A) - * equed = 'C': transpose(A) := transpose(A) * diag(C) - * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). - * - * perm_c (input/output) int* - * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, - * which defines the permutation matrix Pc; perm_c[i] = j means - * column i of A is in position j in A*Pc. - * On exit, perm_c may be overwritten by the product of the input - * perm_c and a permutation that postorders the elimination tree - * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree - * is already in postorder. - * - * If A->Stype = SLU_NR, column permutation vector of size A->nrow, - * which describes permutation of columns of transpose(A) - * (rows of A) as described above. - * - * perm_r (input/output) int* - * If A->Stype = SLU_NC, row permutation vector of size A->nrow, - * which defines the permutation matrix Pr, and is determined - * by partial pivoting. perm_r[i] = j means row i of A is in - * position j in Pr*A. - * - * If A->Stype = SLU_NR, permutation vector of size A->ncol, which - * determines permutation of rows of transpose(A) - * (columns of A) as described above. - * - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by a - * new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument. - * - * etree (input/output) int*, dimension (A->ncol) - * Elimination tree of Pc'*A'*A*Pc. - * If options->Fact != FACTORED and options->Fact != DOFACT, - * etree is an input argument, otherwise it is an output argument. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * - * equed (input/output) char* - * Specifies the form of equilibration that was done. - * = 'N': No equilibration. - * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). - * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). - * = 'B': Both row and column equilibration, i.e., A was replaced - * by diag(R)*A*diag(C). - * If options->Fact = FACTORED, equed is an input argument, - * otherwise it is an output argument. - * - * R (input/output) double*, dimension (A->nrow) - * The row scale factors for A or transpose(A). - * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). - * If equed = 'N' or 'C', R is not accessed. - * If options->Fact = FACTORED, R is an input argument, - * otherwise, R is output. - * If options->zFact = FACTORED and equed = 'R' or 'B', each element - * of R must be positive. - * - * C (input/output) double*, dimension (A->ncol) - * The column scale factors for A or transpose(A). - * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) - * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). - * If equed = 'N' or 'R', C is not accessed. - * If options->Fact = FACTORED, C is an input argument, - * otherwise, C is output. - * If options->Fact = FACTORED and equed = 'C' or 'B', each element - * of C must be positive. - * - * L (output) SuperMatrix* - * The factor L from the factorization - * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses compressed row subscripts storage for supernodes, i.e., - * L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization - * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or - * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). - * Uses column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. - * - * work (workspace/output) void*, size (lwork) (in bytes) - * User supplied workspace, should be large enough - * to hold data structures for factors L and U. - * On exit, if fact is not 'F', L and U point to this array. - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * mem_usage->total_needed; no other side effects. - * - * See argument 'mem_usage' for memory usage statistics. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * If B->ncol = 0, only LU decomposition is performed, the triangular - * solve is skipped. - * On exit, - * if equed = 'N', B is not modified; otherwise - * if A->Stype = SLU_NC: - * if options->Trans = NOTRANS and equed = 'R' or 'B', - * B is overwritten by diag(R)*B; - * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', - * B is overwritten by diag(C)*B; - * if A->Stype = SLU_NR: - * if options->Trans = NOTRANS and equed = 'C' or 'B', - * B is overwritten by diag(C)*B; - * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', - * B is overwritten by diag(R)*B. - * - * X (output) SuperMatrix* - * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * If info = 0 or info = A->ncol+1, X contains the solution matrix - * to the original system of equations. Note that A and B are modified - * on exit if equed is not 'N', and the solution to the equilibrated - * system is inv(diag(C))*X if options->Trans = NOTRANS and - * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' - * and equed = 'R' or 'B'. - * - * recip_pivot_growth (output) double* - * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). - * The infinity norm is used. If recip_pivot_growth is much less - * than 1, the stability of the LU factorization could be poor. - * - * rcond (output) double* - * The estimate of the reciprocal condition number of the matrix A - * after equilibration (if done). If rcond is less than the machine - * precision (in particular, if rcond = 0), the matrix is singular - * to working precision. This condition is indicated by a return - * code of info > 0. - * - * FERR (output) double*, dimension (B->ncol) - * The estimated forward error bound for each solution vector - * X(j) (the j-th column of the solution matrix X). - * If XTRUE is the true solution corresponding to X(j), FERR(j) - * is an estimated upper bound for the magnitude of the largest - * element in (X(j) - XTRUE) divided by the magnitude of the - * largest element in X(j). The estimate is as reliable as - * the estimate for RCOND, and is almost always a slight - * overestimate of the true error. - * If options->IterRefine = NOREFINE, ferr = 1.0. - * - * BERR (output) double*, dimension (B->ncol) - * The componentwise relative backward error of each solution - * vector X(j) (i.e., the smallest relative change in - * any element of A or B that makes X(j) an exact solution). - * If options->IterRefine = NOREFINE, berr = 1.0. - * - * mem_usage (output) mem_usage_t* - * Record the memory usage statistics, consisting of following fields: - * - for_lu (float) - * The amount of space used in bytes for L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * The number of memory expansions during the LU factorization. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly - * singular, so the solution and error bounds - * could not be computed. - * = A->ncol+1: U is nonsingular, but RCOND is less than machine - * precision, meaning that the matrix is singular to - * working precision. Nevertheless, the solution and - * error bounds are computed because there are a number - * of situations where the computed solution can be more - * accurate than the value of RCOND would suggest. - * > A->ncol+1: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. - * - */ - - DNformat *Bstore, *Xstore; - doublecomplex *Bmat, *Xmat; - int ldb, ldx, nrhs; - SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ - SuperMatrix AC; /* Matrix postmultiplied by Pc */ - int colequ, equil, nofact, notran, rowequ, permc_spec; - trans_t trant; - char norm[1]; - int i, j, info1; - double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; - int relax, panel_size; - double diag_pivot_thresh, drop_tol; - double t0; /* temporary time */ - double *utime; - - /* External functions */ - extern double zlangs(char *, SuperMatrix *); - extern double dlamch_(char *); - - Bstore = B->Store; - Xstore = X->Store; - Bmat = Bstore->nzval; - Xmat = Xstore->nzval; - ldb = Bstore->lda; - ldx = Xstore->lda; - nrhs = B->ncol; - - *info = 0; - nofact = (options->Fact != FACTORED); - equil = (options->Equil == YES); - notran = (options->Trans == NOTRANS); - if ( nofact ) { - *(unsigned char *)equed = 'N'; - rowequ = FALSE; - colequ = FALSE; - } else { - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - smlnum = dlamch_("Safe minimum"); - bignum = 1. / smlnum; - } - -#if 0 -printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", - options->Fact, options->Trans, *equed); -#endif - - /* Test the input parameters */ - if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && - options->Fact != SamePattern_SameRowPerm && - !notran && options->Trans != TRANS && options->Trans != CONJ && - !equil && options->Equil != NO) - *info = -1; - else if ( A->nrow != A->ncol || A->nrow < 0 || - (A->Stype != SLU_NC && A->Stype != SLU_NR) || - A->Dtype != SLU_Z || A->Mtype != SLU_GE ) - *info = -2; - else if (options->Fact == FACTORED && - !(rowequ || colequ || lsame_(equed, "N"))) - *info = -6; - else { - if (rowequ) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, R[j]); - rcmax = SUPERLU_MAX(rcmax, R[j]); - } - if (rcmin <= 0.) *info = -7; - else if ( A->nrow > 0) - rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else rowcnd = 1.; - } - if (colequ && *info == 0) { - rcmin = bignum; - rcmax = 0.; - for (j = 0; j < A->nrow; ++j) { - rcmin = SUPERLU_MIN(rcmin, C[j]); - rcmax = SUPERLU_MAX(rcmax, C[j]); - } - if (rcmin <= 0.) *info = -8; - else if (A->nrow > 0) - colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); - else colcnd = 1.; - } - if (*info == 0) { - if ( lwork < -1 ) *info = -12; - else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_Z || - B->Mtype != SLU_GE ) - *info = -13; - else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || - (B->ncol != 0 && B->ncol != X->ncol) || - X->Stype != SLU_DN || - X->Dtype != SLU_Z || X->Mtype != SLU_GE ) - *info = -14; - } - } - if (*info != 0) { - i = -(*info); - xerbla_("zgssvx", &i); - return; - } - - /* Initialization for factor parameters */ - panel_size = sp_ienv(1); - relax = sp_ienv(2); - diag_pivot_thresh = options->DiagPivotThresh; - drop_tol = 0.0; - - utime = stat->utime; - - /* Convert A to SLU_NC format when necessary. */ - if ( A->Stype == SLU_NR ) { - NRformat *Astore = A->Store; - AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); - zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, - Astore->nzval, Astore->colind, Astore->rowptr, - SLU_NC, A->Dtype, A->Mtype); - if ( notran ) { /* Reverse the transpose argument. */ - trant = TRANS; - notran = 0; - } else { - trant = NOTRANS; - notran = 1; - } - } else { /* A->Stype == SLU_NC */ - trant = options->Trans; - AA = A; - } - - if ( nofact && equil ) { - t0 = SuperLU_timer_(); - /* Compute row and column scalings to equilibrate the matrix A. */ - zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); - - if ( info1 == 0 ) { - /* Equilibrate matrix A. */ - zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); - rowequ = lsame_(equed, "R") || lsame_(equed, "B"); - colequ = lsame_(equed, "C") || lsame_(equed, "B"); - } - utime[EQUIL] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Scale the right hand side if equilibration was performed. */ - if ( notran ) { - if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); - } - } - } else if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); - } - } - } - - if ( nofact ) { - - t0 = SuperLU_timer_(); - /* - * Gnet column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = COLAMD: approximate minimum degree column ordering - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) - get_perm_c(permc_spec, AA, perm_c); - utime[COLPERM] = SuperLU_timer_() - t0; - - t0 = SuperLU_timer_(); - sp_preorder(options, AA, perm_c, etree, &AC); - utime[ETREE] = SuperLU_timer_() - t0; - -/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", - relax, panel_size, sp_ienv(3), sp_ienv(4)); - fflush(stdout); */ - - /* Compute the LU factorization of A*Pc. */ - t0 = SuperLU_timer_(); - zgstrf(options, &AC, drop_tol, relax, panel_size, - etree, work, lwork, perm_c, perm_r, L, U, stat, info); - utime[FACT] = SuperLU_timer_() - t0; - - if ( lwork == -1 ) { - mem_usage->total_needed = *info - A->ncol; - return; - } - } - - if ( options->PivotGrowth ) { - if ( *info > 0 ) { - if ( *info <= A->ncol ) { - /* Compute the reciprocal pivot growth factor of the leading - rank-deficient *info columns of A. */ - *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); - } - return; - } - - /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ - *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); - } - - if ( options->ConditionNumber ) { - /* Estimate the reciprocal of the condition number of A. */ - t0 = SuperLU_timer_(); - if ( notran ) { - *(unsigned char *)norm = '1'; - } else { - *(unsigned char *)norm = 'I'; - } - anorm = zlangs(norm, AA); - zgscon(norm, L, U, anorm, rcond, stat, info); - utime[RCOND] = SuperLU_timer_() - t0; - } - - if ( nrhs > 0 ) { - /* Compute the solution matrix X. */ - for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ - for (i = 0; i < B->nrow; i++) - Xmat[i + j*ldx] = Bmat[i + j*ldb]; - - t0 = SuperLU_timer_(); - zgstrs (trant, L, U, perm_c, perm_r, X, stat, info); - utime[SOLVE] = SuperLU_timer_() - t0; - - /* Use iterative refinement to improve the computed solution and compute - error bounds and backward error estimates for it. */ - t0 = SuperLU_timer_(); - if ( options->IterRefine != NOREFINE ) { - zgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, - X, ferr, berr, stat, info); - } else { - for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; - } - utime[REFINE] = SuperLU_timer_() - t0; - - /* Transform the solution matrix X to a solution of the original system. */ - if ( notran ) { - if ( colequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); - } - } - } else if ( rowequ ) { - for (j = 0; j < nrhs; ++j) - for (i = 0; i < A->nrow; ++i) { - zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); - } - } - } /* end if nrhs > 0 */ - - if ( options->ConditionNumber ) { - /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ - if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; - } - - if ( nofact ) { - zQuerySpace(L, U, mem_usage); - Destroy_CompCol_Permuted(&AC); - } - if ( A->Stype == SLU_NR ) { - Destroy_SuperMatrix_Store(AA); - SUPERLU_FREE(AA); - } - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrf.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,433 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -void -zgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol, - int relax, int panel_size, int *etree, void *work, int lwork, - int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * ZGSTRF computes an LU factorization of a general sparse m-by-n - * matrix A using partial pivoting with row interchanges. - * The factorization has the form - * Pr * A = L * U - * where Pr is a row permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper - * triangular (upper trapezoidal if A->nrow < A->ncol). - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * options (input) superlu_options_t* - * The structure defines the input parameters to control - * how the LU decomposition will be performed. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. - * - * drop_tol (input) double (NOT IMPLEMENTED) - * Drop tolerance parameter. At step j of the Gaussian elimination, - * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. - * 0 <= drop_tol <= 1. The default value of drop_tol is 0. - * - * relax (input) int - * To control degree of relaxing supernodes. If the number - * of nodes (columns) in a subtree of the elimination tree is less - * than relax, this subtree is considered as one supernode, - * regardless of the row structures of those columns. - * - * panel_size (input) int - * A panel consists of at most panel_size consecutive columns. - * - * etree (input) int*, dimension (A->ncol) - * Elimination tree of A'*A. - * Note: etree is a vector of parent pointers for a forest whose - * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. - * On input, the columns of A should be permuted so that the - * etree is in a certain postorder. - * - * work (input/output) void*, size (lwork) (in bytes) - * User-supplied work space and space for the output data structures. - * Not referenced if lwork = 0; - * - * lwork (input) int - * Specifies the size of work array in bytes. - * = 0: allocate space internally by system malloc; - * > 0: use user-supplied work array of length lwork in bytes, - * returns error if space runs out. - * = -1: the routine guesses the amount of space needed without - * performing the factorization, and returns it in - * *info; no other side effects. - * - * perm_c (input) int*, dimension (A->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * When searching for diagonal, perm_c[*] is applied to the - * row subscripts of A, so that diagonal threshold pivoting - * can find the diagonal of A, rather than that of A*Pc. - * - * perm_r (input/output) int*, dimension (A->nrow) - * Row permutation vector which defines the permutation matrix Pr, - * perm_r[i] = j means row i of A is in position j in Pr*A. - * If options->Fact = SamePattern_SameRowPerm, the pivoting routine - * will try to use the input perm_r, unless a certain threshold - * criterion is violated. In that case, perm_r is overwritten by - * a new permutation determined by partial pivoting or diagonal - * threshold pivoting. - * Otherwise, perm_r is output argument; - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = SLU_NC, - * Dtype = SLU_Z, Mtype = SLU_TRU. - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * > 0: if info = i, and i is - * <= A->ncol: U(i,i) is exactly zero. The factorization has - * been completed, but the factor U is exactly singular, - * and division by zero will occur if it is used to solve a - * system of equations. - * > A->ncol: number of bytes allocated when memory allocation - * failure occurred, plus A->ncol. If lwork = -1, it is - * the estimated amount of space needed, plus A->ncol. - * - * ====================================================================== - * - * Local Working Arrays: - * ====================== - * m = number of rows in the matrix - * n = number of columns in the matrix - * - * xprune[0:n-1]: xprune[*] points to locations in subscript - * vector lsub[*]. For column i, xprune[i] denotes the point where - * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need - * to be traversed for symbolic factorization. - * - * marker[0:3*m-1]: marker[i] = j means that node i has been - * reached when working on column j. - * Storage: relative to original row subscripts - * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, - * see zpanel_dfs.c; marker2 is used for inner-factorization, - * see zcolumn_dfs.c. - * - * parent[0:m-1]: parent vector used during dfs - * Storage: relative to new row subscripts - * - * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) - * unexplored neighbor of i in lsub[*] - * - * segrep[0:nseg-1]: contains the list of supernodal representatives - * in topological order of the dfs. A supernode representative is the - * last column of a supernode. - * The maximum size of segrep[] is n. - * - * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a - * supernodal representative r, repfnz[r] is the location of the first - * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 - * indicates the supernode r has been explored. - * NOTE: There are W of them, each used for one column of a panel. - * - * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below - * the panel diagonal. These are filled in during zpanel_dfs(), and are - * used later in the inner LU factorization within the panel. - * panel_lsub[]/dense[] pair forms the SPA data structure. - * NOTE: There are W of them. - * - * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; - * NOTE: there are W of them. - * - * tempv[0:*]: real temporary used for dense numeric kernels; - * The size of this array is defined by NUM_TEMPV() in zsp_defs.h. - * - */ - /* Local working arrays */ - NCPformat *Astore; - int *iperm_r = NULL; /* inverse of perm_r; used when - options->Fact == SamePattern_SameRowPerm */ - int *iperm_c; /* inverse of perm_c */ - int *iwork; - doublecomplex *zwork; - int *segrep, *repfnz, *parent, *xplore; - int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ - int *xprune; - int *marker; - doublecomplex *dense, *tempv; - int *relax_end; - doublecomplex *a; - int *asub; - int *xa_begin, *xa_end; - int *xsup, *supno; - int *xlsub, *xlusup, *xusub; - int nzlumax; - static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ - - /* Local scalars */ - fact_t fact = options->Fact; - double diag_pivot_thresh = options->DiagPivotThresh; - int pivrow; /* pivotal row number in the original matrix A */ - int nseg1; /* no of segments in U-column above panel row jcol */ - int nseg; /* no of segments in each U-column */ - register int jcol; - register int kcol; /* end column of a relaxed snode */ - register int icol; - register int i, k, jj, new_next, iinfo; - int m, n, min_mn, jsupno, fsupc, nextlu, nextu; - int w_def; /* upper bound on panel width */ - int usepr, iperm_r_allocated = 0; - int nnzL, nnzU; - int *panel_histo = stat->panel_histo; - flops_t *ops = stat->ops; - - iinfo = 0; - m = A->nrow; - n = A->ncol; - min_mn = SUPERLU_MIN(m, n); - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - - /* Allocate storage common to the factor routines */ - *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, - panel_size, L, U, &Glu, &iwork, &zwork); - if ( *info ) return; - - xsup = Glu.xsup; - supno = Glu.supno; - xlsub = Glu.xlsub; - xlusup = Glu.xlusup; - xusub = Glu.xusub; - - SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, - &repfnz, &panel_lsub, &xprune, &marker); - zSetRWork(m, panel_size, zwork, &dense, &tempv); - - usepr = (fact == SamePattern_SameRowPerm); - if ( usepr ) { - /* Compute the inverse of perm_r */ - iperm_r = (int *) intMalloc(m); - for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; - iperm_r_allocated = 1; - } - iperm_c = (int *) intMalloc(n); - for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; - - /* Identify relaxed snodes */ - relax_end = (int *) intMalloc(n); - if ( options->SymmetricMode == YES ) { - heap_relax_snode(n, etree, relax, marker, relax_end); - } else { - relax_snode(n, etree, relax, marker, relax_end); - } - - ifill (perm_r, m, EMPTY); - ifill (marker, m * NO_MARKER, EMPTY); - supno[0] = -1; - xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; - w_def = panel_size; - - /* - * Work on one "panel" at a time. A panel is one of the following: - * (a) a relaxed supernode at the bottom of the etree, or - * (b) panel_size contiguous columns, defined by the user - */ - for (jcol = 0; jcol < min_mn; ) { - - if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ - kcol = relax_end[jcol]; /* end of the relaxed snode */ - panel_histo[kcol-jcol+1]++; - - /* -------------------------------------- - * Factorize the relaxed supernode(jcol:kcol) - * -------------------------------------- */ - /* Determine the union of the row structure of the snode */ - if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, - xprune, marker, &Glu)) != 0 ) - return; - - nextu = xusub[jcol]; - nextlu = xlusup[jcol]; - jsupno = supno[jcol]; - fsupc = xsup[jsupno]; - new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); - nzlumax = Glu.nzlumax; - while ( new_next > nzlumax ) { - if ( (*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) - return; - } - - for (icol = jcol; icol<= kcol; icol++) { - xusub[icol+1] = nextu; - - /* Scatter into SPA dense[*] */ - for (k = xa_begin[icol]; k < xa_end[icol]; k++) - dense[asub[k]] = a[k]; - - /* Numeric update within the snode */ - zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); - - if ( (*info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - -#ifdef DEBUG - zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); -#endif - - } - - jcol = icol; - - } else { /* Work on one panel of panel_size columns */ - - /* Adjust panel_size so that a panel won't overlap with the next - * relaxed snode. - */ - panel_size = w_def; - for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) - if ( relax_end[k] != EMPTY ) { - panel_size = k - jcol; - break; - } - if ( k == min_mn ) panel_size = min_mn - jcol; - panel_histo[panel_size]++; - - /* symbolic factor on a panel of columns */ - zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, - dense, panel_lsub, segrep, repfnz, xprune, - marker, parent, xplore, &Glu); - - /* numeric sup-panel updates in topological order */ - zpanel_bmod(m, panel_size, jcol, nseg1, dense, - tempv, segrep, repfnz, &Glu, stat); - - /* Sparse LU within the panel, and below panel diagonal */ - for ( jj = jcol; jj < jcol + panel_size; jj++) { - k = (jj - jcol) * m; /* column index for w-wide arrays */ - - nseg = nseg1; /* Begin after all the panel segments */ - - if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], - segrep, &repfnz[k], xprune, marker, - parent, xplore, &Glu)) != 0) return; - - /* Numeric updates */ - if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], - tempv, &segrep[nseg1], &repfnz[k], - jcol, &Glu, stat)) != 0) return; - - /* Copy the U-segments to ucol[*] */ - if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], - perm_r, &dense[k], &Glu)) != 0) - return; - - if ( (*info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, - iperm_r, iperm_c, &pivrow, &Glu, stat)) ) - if ( iinfo == 0 ) iinfo = *info; - - /* Prune columns (0:jj-1) using column jj */ - zpruneL(jj, perm_r, pivrow, nseg, segrep, - &repfnz[k], xprune, &Glu); - - /* Reset repfnz[] for this column */ - resetrep_col (nseg, segrep, &repfnz[k]); - -#ifdef DEBUG - zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); -#endif - - } - - jcol += panel_size; /* Move to the next panel */ - - } /* else */ - - } /* for */ - - *info = iinfo; - - if ( m > n ) { - k = 0; - for (i = 0; i < m; ++i) - if ( perm_r[i] == EMPTY ) { - perm_r[i] = n + k; - ++k; - } - } - - countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); - fixupL(min_mn, perm_r, &Glu); - - zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ - - if ( fact == SamePattern_SameRowPerm ) { - /* L and U structures may have changed due to possibly different - pivoting, even though the storage is available. - There could also be memory expansions, so the array locations - may have changed, */ - ((SCformat *)L->Store)->nnz = nnzL; - ((SCformat *)L->Store)->nsuper = Glu.supno[n]; - ((SCformat *)L->Store)->nzval = Glu.lusup; - ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; - ((SCformat *)L->Store)->rowind = Glu.lsub; - ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; - ((NCformat *)U->Store)->nnz = nnzU; - ((NCformat *)U->Store)->nzval = Glu.ucol; - ((NCformat *)U->Store)->rowind = Glu.usub; - ((NCformat *)U->Store)->colptr = Glu.xusub; - } else { - zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, - Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, - Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); - zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, - Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); - } - - ops[FACT] += ops[TRSV] + ops[GEMV]; - - if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); - SUPERLU_FREE (iperm_c); - SUPERLU_FREE (relax_end); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,345 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - - -/* - * Function prototypes - */ -void zusolve(int, int, doublecomplex*, doublecomplex*); -void zlsolve(int, int, doublecomplex*, doublecomplex*); -void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); - - -void -zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, - int *perm_c, int *perm_r, SuperMatrix *B, - SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * ZGSTRS solves a system of linear equations A*X=B or A'*X=B - * with A sparse and B dense, using the LU factorization computed by - * ZGSTRF. - * - * See supermatrix.h for the definition of 'SuperMatrix' structure. - * - * Arguments - * ========= - * - * trans (input) trans_t - * Specifies the form of the system of equations: - * = NOTRANS: A * X = B (No transpose) - * = TRANS: A'* X = B (Transpose) - * = CONJ: A**H * X = B (Conjugate transpose) - * - * L (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U as computed by - * zgstrf(). Use compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. - * - * U (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U as computed by - * zgstrf(). Use column-wise storage scheme, i.e., U has types: - * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. - * - * perm_c (input) int*, dimension (L->ncol) - * Column permutation vector, which defines the - * permutation matrix Pc; perm_c[i] = j means column i of A is - * in position j in A*Pc. - * - * perm_r (input) int*, dimension (L->nrow) - * Row permutation vector, which defines the permutation matrix Pr; - * perm_r[i] = j means row i of A is in position j in Pr*A. - * - * B (input/output) SuperMatrix* - * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. - * On entry, the right hand side matrix. - * On exit, the solution matrix if info = 0; - * - * stat (output) SuperLUStat_t* - * Record the statistics on runtime and floating-point operation count. - * See util.h for the definition of 'SuperLUStat_t'. - * - * info (output) int* - * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value - * - */ -#ifdef _CRAY - _fcd ftcs1, ftcs2, ftcs3, ftcs4; -#endif - int incx = 1, incy = 1; -#ifdef USE_VENDOR_BLAS - doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; - doublecomplex *work_col; -#endif - doublecomplex temp_comp; - DNformat *Bstore; - doublecomplex *Bmat; - SCformat *Lstore; - NCformat *Ustore; - doublecomplex *Lval, *Uval; - int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; - int i, j, k, iptr, jcol, n, ldb, nrhs; - doublecomplex *work, *rhs_work, *soln; - flops_t solve_ops; - void zprint_soln(); - - /* Test input parameters ... */ - *info = 0; - Bstore = B->Store; - ldb = Bstore->lda; - nrhs = B->ncol; - if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; - else if ( L->nrow != L->ncol || L->nrow < 0 || - L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) - *info = -2; - else if ( U->nrow != U->ncol || U->nrow < 0 || - U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) - *info = -3; - else if ( ldb < SUPERLU_MAX(0, L->nrow) || - B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) - *info = -6; - if ( *info ) { - i = -(*info); - xerbla_("zgstrs", &i); - return; - } - - n = L->nrow; - work = doublecomplexCalloc(n * nrhs); - if ( !work ) ABORT("Malloc fails for local work[]."); - soln = doublecomplexMalloc(n); - if ( !soln ) ABORT("Malloc fails for local soln[]."); - - Bmat = Bstore->nzval; - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( trans == NOTRANS ) { - /* Permute right hand sides to form Pr*B */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - /* Forward solve PLy=Pb. */ - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - nrow = nsupr - nsupc; - - solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; - solve_ops += 8 * nrow * nsupc * nrhs; - - if ( nsupc == 1 ) { - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - luptr = L_NZ_START(fsupc); - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ - irow = L_SUB(iptr); - ++luptr; - zz_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); - z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); - } - } - } else { - luptr = L_NZ_START(fsupc); -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#else - ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); - - zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, - &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, - &beta, &work[0], &n ); -#endif - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - work_col = &work[j*n]; - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - z_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); - work_col[i].r = 0.0; - work_col[i].i = 0.0; - iptr++; - } - } -#else - for (j = 0; j < nrhs; j++) { - rhs_work = &Bmat[j*ldb]; - zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); - zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], - &rhs_work[fsupc], &work[0] ); - - iptr = istart + nsupc; - for (i = 0; i < nrow; i++) { - irow = L_SUB(iptr); - z_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); - work[i].r = 0.; - work[i].i = 0.; - iptr++; - } - } -#endif - } /* else ... */ - } /* for L-solve */ - -#ifdef DEBUG - printf("After L-solve: y=\n"); - zprint_soln(n, nrhs, Bmat); -#endif - - /* - * Back solve Ux=y. - */ - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; - - if ( nsupc == 1 ) { - rhs_work = &Bmat[0]; - for (j = 0; j < nrhs; j++) { - z_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); - rhs_work += ldb; - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("U", strlen("U")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#else - ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, - &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); -#endif -#else - for (j = 0; j < nrhs; j++) - zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); -#endif - } - - for (j = 0; j < nrhs; ++j) { - rhs_work = &Bmat[j*ldb]; - for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ - irow = U_SUB(i); - zz_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); - z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); - } - } - } - - } /* for U-solve */ - -#ifdef DEBUG - printf("After U-solve: x=\n"); - zprint_soln(n, nrhs, Bmat); -#endif - - /* Compute the final solution X := Pc*X. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = solve_ops; - - } else { /* Solve A'*X=B or CONJ(A)*X=B */ - /* Permute right hand sides to form Pc'*B. */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - stat->ops[SOLVE] = 0; - if (trans == TRANS) { - for (k = 0; k < nrhs; ++k) { - /* Multiply by inv(U'). */ - sp_ztrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by inv(L'). */ - sp_ztrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); - } - } else { /* trans == CONJ */ - for (k = 0; k < nrhs; ++k) { - /* Multiply by conj(inv(U')). */ - sp_ztrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); - - /* Multiply by conj(inv(L')). */ - sp_ztrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); - } - } - /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ - for (i = 0; i < nrhs; i++) { - rhs_work = &Bmat[i*ldb]; - for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; - for (k = 0; k < n; k++) rhs_work[k] = soln[k]; - } - - } - - SUPERLU_FREE(work); - SUPERLU_FREE(soln); -} - -/* - * Diagnostic print of the solution vector - */ -void -zprint_soln(int n, int nrhs, doublecomplex *soln) -{ - int i; - - for (i = 0; i < n; i++) - printf("\t%d: %.4f\n", i, soln[i]); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlacon.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlacon.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlacon.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,213 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_Cnames.h" -#include "slu_dcomplex.h" - -int -zlacon_(int *n, doublecomplex *v, doublecomplex *x, double *est, int *kase) - -{ -/* - Purpose - ======= - - ZLACON estimates the 1-norm of a square matrix A. - Reverse communication is used for evaluating matrix-vector products. - - - Arguments - ========= - - N (input) INT - The order of the matrix. N >= 1. - - V (workspace) DOUBLE COMPLEX PRECISION array, dimension (N) - On the final return, V = A*W, where EST = norm(V)/norm(W) - (W is not returned). - - X (input/output) DOUBLE COMPLEX PRECISION array, dimension (N) - On an intermediate return, X should be overwritten by - A * X, if KASE=1, - A' * X, if KASE=2, - where A' is the conjugate transpose of A, - and ZLACON must be re-called with all the other parameters - unchanged. - - - EST (output) DOUBLE PRECISION - An estimate (a lower bound) for norm(A). - - KASE (input/output) INT - On the initial call to ZLACON, KASE should be 0. - On an intermediate return, KASE will be 1 or 2, indicating - whether X should be overwritten by A * X or A' * X. - On the final return from ZLACON, KASE will again be 0. - - Further Details - ======= ======= - - Contributed by Nick Higham, University of Manchester. - Originally named CONEST, dated March 16, 1988. - - Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of - a real or complex matrix, with applications to condition estimation", - ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. - ===================================================================== -*/ - - /* Table of constant values */ - int c__1 = 1; - doublecomplex zero = {0.0, 0.0}; - doublecomplex one = {1.0, 0.0}; - - /* System generated locals */ - double d__1; - - /* Local variables */ - static int iter; - static int jump, jlast; - static double altsgn, estold; - static int i, j; - double temp; - double safmin; - extern double dlamch_(char *); - extern int izmax1_(int *, doublecomplex *, int *); - extern double dzsum1_(int *, doublecomplex *, int *); - - safmin = dlamch_("Safe minimum"); - if ( *kase == 0 ) { - for (i = 0; i < *n; ++i) { - x[i].r = 1. / (double) (*n); - x[i].i = 0.; - } - *kase = 1; - jump = 1; - return 0; - } - - switch (jump) { - case 1: goto L20; - case 2: goto L40; - case 3: goto L70; - case 4: goto L110; - case 5: goto L140; - } - - /* ................ ENTRY (JUMP = 1) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ - L20: - if (*n == 1) { - v[0] = x[0]; - *est = z_abs(&v[0]); - /* ... QUIT */ - goto L150; - } - *est = dzsum1_(n, x, &c__1); - - for (i = 0; i < *n; ++i) { - d__1 = z_abs(&x[i]); - if (d__1 > safmin) { - d__1 = 1 / d__1; - x[i].r *= d__1; - x[i].i *= d__1; - } else { - x[i] = one; - } - } - *kase = 2; - jump = 2; - return 0; - - /* ................ ENTRY (JUMP = 2) - FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ -L40: - j = izmax1_(n, &x[0], &c__1); - --j; - iter = 2; - - /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ -L50: - for (i = 0; i < *n; ++i) x[i] = zero; - x[j] = one; - *kase = 1; - jump = 3; - return 0; - - /* ................ ENTRY (JUMP = 3) - X HAS BEEN OVERWRITTEN BY A*X. */ -L70: -#ifdef _CRAY - CCOPY(n, x, &c__1, v, &c__1); -#else - zcopy_(n, x, &c__1, v, &c__1); -#endif - estold = *est; - *est = dzsum1_(n, v, &c__1); - - -L90: - /* TEST FOR CYCLING. */ - if (*est <= estold) goto L120; - - for (i = 0; i < *n; ++i) { - d__1 = z_abs(&x[i]); - if (d__1 > safmin) { - d__1 = 1 / d__1; - x[i].r *= d__1; - x[i].i *= d__1; - } else { - x[i] = one; - } - } - *kase = 2; - jump = 4; - return 0; - - /* ................ ENTRY (JUMP = 4) - X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ -L110: - jlast = j; - j = izmax1_(n, &x[0], &c__1); - --j; - if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) { - ++iter; - goto L50; - } - - /* ITERATION COMPLETE. FINAL STAGE. */ -L120: - altsgn = 1.; - for (i = 1; i <= *n; ++i) { - x[i-1].r = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); - x[i-1].i = 0.; - altsgn = -altsgn; - } - *kase = 1; - jump = 5; - return 0; - - /* ................ ENTRY (JUMP = 5) - X HAS BEEN OVERWRITTEN BY A*X. */ -L140: - temp = dzsum1_(n, x, &c__1) / (double)(*n * 3) * 2.; - if (temp > *est) { -#ifdef _CRAY - CCOPY(n, &x[0], &c__1, &v[0], &c__1); -#else - zcopy_(n, &x[0], &c__1, &v[0], &c__1); -#endif - *est = temp; - } - -L150: - *kase = 0; - return 0; - -} /* zlacon_ */ diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlangs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlangs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlangs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,110 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: zlangs.c - * History: Modified from lapack routine ZLANGE - */ -#include -#include "slu_zdefs.h" - -double zlangs(char *norm, SuperMatrix *A) -{ -/* - Purpose - ======= - - ZLANGS returns the value of the one norm, or the Frobenius norm, or - the infinity norm, or the element of largest absolute value of a - real matrix A. - - Description - =========== - - ZLANGE returns the value - - ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' - ( - ( norm1(A), NORM = '1', 'O' or 'o' - ( - ( normI(A), NORM = 'I' or 'i' - ( - ( normF(A), NORM = 'F', 'f', 'E' or 'e' - - where norm1 denotes the one norm of a matrix (maximum column sum), - normI denotes the infinity norm of a matrix (maximum row sum) and - normF denotes the Frobenius norm of a matrix (square root of sum of - squares). Note that max(abs(A(i,j))) is not a matrix norm. - - Arguments - ========= - - NORM (input) CHARACTER*1 - Specifies the value to be returned in ZLANGE as described above. - A (input) SuperMatrix* - The M by N sparse matrix A. - - ===================================================================== -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double value, sum; - double *rwork; - - Astore = A->Store; - Aval = Astore->nzval; - - if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { - value = 0.; - - } else if (lsame_(norm, "M")) { - /* Find max(abs(A(i,j))). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - value = SUPERLU_MAX( value, z_abs( &Aval[i]) ); - - } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { - /* Find norm1(A). */ - value = 0.; - for (j = 0; j < A->ncol; ++j) { - sum = 0.; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) - sum += z_abs( &Aval[i] ); - value = SUPERLU_MAX(value,sum); - } - - } else if (lsame_(norm, "I")) { - /* Find normI(A). */ - if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) - ABORT("SUPERLU_MALLOC fails for rwork."); - for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { - irow = Astore->rowind[i]; - rwork[irow] += z_abs( &Aval[i] ); - } - value = 0.; - for (i = 0; i < A->nrow; ++i) - value = SUPERLU_MAX(value, rwork[i]); - - SUPERLU_FREE (rwork); - - } else if (lsame_(norm, "F") || lsame_(norm, "E")) { - /* Find normF(A). */ - ABORT("Not implemented."); - } else - ABORT("Illegal norm specified."); - - return (value); - -} /* zlangs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlaqgs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlaqgs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlaqgs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,138 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: zlaqgs.c - * History: Modified from LAPACK routine ZLAQGE - */ -#include -#include "slu_zdefs.h" - -void -zlaqgs(SuperMatrix *A, double *r, double *c, - double rowcnd, double colcnd, double amax, char *equed) -{ -/* - Purpose - ======= - - ZLAQGS equilibrates a general sparse M by N matrix A using the row and - scaling factors in the vectors R and C. - - See supermatrix.h for the definition of 'SuperMatrix' structure. - - Arguments - ========= - - A (input/output) SuperMatrix* - On exit, the equilibrated matrix. See EQUED for the form of - the equilibrated matrix. The type of A can be: - Stype = NC; Dtype = SLU_Z; Mtype = GE. - - R (input) double*, dimension (A->nrow) - The row scale factors for A. - - C (input) double*, dimension (A->ncol) - The column scale factors for A. - - ROWCND (input) double - Ratio of the smallest R(i) to the largest R(i). - - COLCND (input) double - Ratio of the smallest C(i) to the largest C(i). - - AMAX (input) double - Absolute value of largest matrix entry. - - EQUED (output) char* - Specifies the form of equilibration that was done. - = 'N': No equilibration - = 'R': Row equilibration, i.e., A has been premultiplied by - diag(R). - = 'C': Column equilibration, i.e., A has been postmultiplied - by diag(C). - = 'B': Both row and column equilibration, i.e., A has been - replaced by diag(R) * A * diag(C). - - Internal Parameters - =================== - - THRESH is a threshold value used to decide if row or column scaling - should be done based on the ratio of the row or column scaling - factors. If ROWCND < THRESH, row scaling is done, and if - COLCND < THRESH, column scaling is done. - - LARGE and SMALL are threshold values used to decide if row scaling - should be done based on the absolute size of the largest matrix - element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. - - ===================================================================== -*/ - -#define THRESH (0.1) - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int i, j, irow; - double large, small, cj; - extern double dlamch_(char *); - double temp; - - - /* Quick return if possible */ - if (A->nrow <= 0 || A->ncol <= 0) { - *(unsigned char *)equed = 'N'; - return; - } - - Astore = A->Store; - Aval = Astore->nzval; - - /* Initialize LARGE and SMALL. */ - small = dlamch_("Safe minimum") / dlamch_("Precision"); - large = 1. / small; - - if (rowcnd >= THRESH && amax >= small && amax <= large) { - if (colcnd >= THRESH) - *(unsigned char *)equed = 'N'; - else { - /* Column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - zd_mult(&Aval[i], &Aval[i], cj); - } - } - *(unsigned char *)equed = 'C'; - } - } else if (colcnd >= THRESH) { - /* Row scaling, no column scaling */ - for (j = 0; j < A->ncol; ++j) - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zd_mult(&Aval[i], &Aval[i], r[irow]); - } - *(unsigned char *)equed = 'R'; - } else { - /* Row and column scaling */ - for (j = 0; j < A->ncol; ++j) { - cj = c[j]; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - temp = cj * r[irow]; - zd_mult(&Aval[i], &Aval[i], temp); - } - } - *(unsigned char *)equed = 'B'; - } - - return; - -} /* zlaqgs */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmemory.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmemory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmemory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,680 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -#include "slu_zdefs.h" - -/* Constants */ -#define NO_MEMTYPE 4 /* 0: lusup; - 1: ucol; - 2: lsub; - 3: usub */ -#define GluIntArray(n) (5 * (n) + 5) - -/* Internal prototypes */ -void *zexpand (int *, MemType,int, int, GlobalLU_t *); -int zLUWorkInit (int, int, int, int **, doublecomplex **, LU_space_t); -void copy_mem_doublecomplex (int, void *, void *); -void zStackCompress (GlobalLU_t *); -void zSetupSpace (void *, int, LU_space_t *); -void *zuser_malloc (int, int); -void zuser_free (int, int); - -/* External prototypes (in memory.c - prec-indep) */ -extern void copy_mem_int (int, void *, void *); -extern void user_bcopy (char *, char *, int); - -/* Headers for 4 types of dynamatically managed memory */ -typedef struct e_node { - int size; /* length of the memory that has been used */ - void *mem; /* pointer to the new malloc'd store */ -} ExpHeader; - -typedef struct { - int size; - int used; - int top1; /* grow upward, relative to &array[0] */ - int top2; /* grow downward */ - void *array; -} LU_stack_t; - -/* Variables local to this file */ -static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */ -static LU_stack_t stack; -static int no_expand; - -/* Macros to manipulate stack */ -#define StackFull(x) ( x + stack.used >= stack.size ) -#define NotDoubleAlign(addr) ( (long int)addr & 7 ) -#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) -#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ - (w + 1) * m * sizeof(doublecomplex) ) -#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ - - - - -/* - * Setup the memory model to be used for factorization. - * lwork = 0: use system malloc; - * lwork > 0: use user-supplied work[] space. - */ -void zSetupSpace(void *work, int lwork, LU_space_t *MemModel) -{ - if ( lwork == 0 ) { - *MemModel = SYSTEM; /* malloc/free */ - } else if ( lwork > 0 ) { - *MemModel = USER; /* user provided space */ - stack.used = 0; - stack.top1 = 0; - stack.top2 = (lwork/4)*4; /* must be word addressable */ - stack.size = stack.top2; - stack.array = (void *) work; - } -} - - - -void *zuser_malloc(int bytes, int which_end) -{ - void *buf; - - if ( StackFull(bytes) ) return (NULL); - - if ( which_end == HEAD ) { - buf = (char*) stack.array + stack.top1; - stack.top1 += bytes; - } else { - stack.top2 -= bytes; - buf = (char*) stack.array + stack.top2; - } - - stack.used += bytes; - return buf; -} - - -void zuser_free(int bytes, int which_end) -{ - if ( which_end == HEAD ) { - stack.top1 -= bytes; - } else { - stack.top2 += bytes; - } - stack.used -= bytes; -} - - - -/* - * mem_usage consists of the following fields: - * - for_lu (float) - * The amount of space used in bytes for the L\U data structures. - * - total_needed (float) - * The amount of space needed in bytes to perform factorization. - * - expansions (int) - * Number of memory expansions during the LU factorization. - */ -int zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) -{ - SCformat *Lstore; - NCformat *Ustore; - register int n, iword, dword, panel_size = sp_ienv(1); - - Lstore = L->Store; - Ustore = U->Store; - n = L->ncol; - iword = sizeof(int); - dword = sizeof(doublecomplex); - - /* For LU factors */ - mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] * - dword + Lstore->rowind_colptr[n] * iword ); - mem_usage->for_lu += (float)( (n + 1) * iword + - Ustore->colptr[n] * (dword + iword) ); - - /* Working storage to support factorization */ - mem_usage->total_needed = mem_usage->for_lu + - (float)( (2 * panel_size + 4 + NO_MARKER) * n * iword + - (panel_size + 1) * n * dword ); - - mem_usage->expansions = --no_expand; - - return 0; -} /* zQuerySpace */ - -/* - * Allocate storage for the data structures common to all factor routines. - * For those unpredictable size, make a guess as FILL * nnz(A). - * Return value: - * If lwork = -1, return the estimated amount of space required, plus n; - * otherwise, return the amount of space actually allocated when - * memory allocation failure occurred. - */ -int -zLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, - int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, - int **iwork, doublecomplex **dwork) -{ - int info, iword, dword; - SCformat *Lstore; - NCformat *Ustore; - int *xsup, *supno; - int *lsub, *xlsub; - doublecomplex *lusup; - int *xlusup; - doublecomplex *ucol; - int *usub, *xusub; - int nzlmax, nzumax, nzlumax; - int FILL = sp_ienv(6); - - Glu->n = n; - no_expand = 0; - iword = sizeof(int); - dword = sizeof(doublecomplex); - - if ( !expanders ) - expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader)); - if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); - - if ( fact != SamePattern_SameRowPerm ) { - /* Guess for L\U factors */ - nzumax = nzlumax = FILL * annz; - nzlmax = SUPERLU_MAX(1, FILL/4.) * annz; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else { - zSetupSpace(work, lwork, &Glu->MemModel); - } - -#if ( PRNTlevel >= 1 ) - printf("zLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", - FILL, nzlmax, nzumax); - fflush(stdout); -#endif - - /* Integer pointers for L\U factors */ - if ( Glu->MemModel == SYSTEM ) { - xsup = intMalloc(n+1); - supno = intMalloc(n+1); - xlsub = intMalloc(n+1); - xlusup = intMalloc(n+1); - xusub = intMalloc(n+1); - } else { - xsup = (int *)zuser_malloc((n+1) * iword, HEAD); - supno = (int *)zuser_malloc((n+1) * iword, HEAD); - xlsub = (int *)zuser_malloc((n+1) * iword, HEAD); - xlusup = (int *)zuser_malloc((n+1) * iword, HEAD); - xusub = (int *)zuser_malloc((n+1) * iword, HEAD); - } - - lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); - - while ( !lusup || !ucol || !lsub || !usub ) { - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE(lusup); - SUPERLU_FREE(ucol); - SUPERLU_FREE(lsub); - SUPERLU_FREE(usub); - } else { - zuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD); - } - nzlumax /= 2; - nzumax /= 2; - nzlmax /= 2; - if ( nzlumax < annz ) { - printf("Not enough memory to perform factorization.\n"); - return (zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - } -#if ( PRNTlevel >= 1) - printf("zLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", - nzlmax, nzumax); - fflush(stdout); -#endif - lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); - ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); - lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); - usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); - } - - } else { - /* fact == SamePattern_SameRowPerm */ - Lstore = L->Store; - Ustore = U->Store; - xsup = Lstore->sup_to_col; - supno = Lstore->col_to_sup; - xlsub = Lstore->rowind_colptr; - xlusup = Lstore->nzval_colptr; - xusub = Ustore->colptr; - nzlmax = Glu->nzlmax; /* max from previous factorization */ - nzumax = Glu->nzumax; - nzlumax = Glu->nzlumax; - - if ( lwork == -1 ) { - return ( GluIntArray(n) * iword + TempSpace(m, panel_size) - + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); - } else if ( lwork == 0 ) { - Glu->MemModel = SYSTEM; - } else { - Glu->MemModel = USER; - stack.top2 = (lwork/4)*4; /* must be word-addressable */ - stack.size = stack.top2; - } - - lsub = expanders[LSUB].mem = Lstore->rowind; - lusup = expanders[LUSUP].mem = Lstore->nzval; - usub = expanders[USUB].mem = Ustore->rowind; - ucol = expanders[UCOL].mem = Ustore->nzval;; - expanders[LSUB].size = nzlmax; - expanders[LUSUP].size = nzlumax; - expanders[USUB].size = nzumax; - expanders[UCOL].size = nzumax; - } - - Glu->xsup = xsup; - Glu->supno = supno; - Glu->lsub = lsub; - Glu->xlsub = xlsub; - Glu->lusup = lusup; - Glu->xlusup = xlusup; - Glu->ucol = ucol; - Glu->usub = usub; - Glu->xusub = xusub; - Glu->nzlmax = nzlmax; - Glu->nzumax = nzumax; - Glu->nzlumax = nzlumax; - - info = zLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel); - if ( info ) - return ( info + zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); - - ++no_expand; - return 0; - -} /* zLUMemInit */ - -/* Allocate known working storage. Returns 0 if success, otherwise - returns the number of bytes allocated so far when failure occurred. */ -int -zLUWorkInit(int m, int n, int panel_size, int **iworkptr, - doublecomplex **dworkptr, LU_space_t MemModel) -{ - int isize, dsize, extra; - doublecomplex *old_ptr; - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - - isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); - dsize = (m * panel_size + - NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(doublecomplex); - - if ( MemModel == SYSTEM ) - *iworkptr = (int *) intCalloc(isize/sizeof(int)); - else - *iworkptr = (int *) zuser_malloc(isize, TAIL); - if ( ! *iworkptr ) { - fprintf(stderr, "zLUWorkInit: malloc fails for local iworkptr[]\n"); - return (isize + n); - } - - if ( MemModel == SYSTEM ) - *dworkptr = (doublecomplex *) SUPERLU_MALLOC(dsize); - else { - *dworkptr = (doublecomplex *) zuser_malloc(dsize, TAIL); - if ( NotDoubleAlign(*dworkptr) ) { - old_ptr = *dworkptr; - *dworkptr = (doublecomplex*) DoubleAlign(*dworkptr); - *dworkptr = (doublecomplex*) ((double*)*dworkptr - 1); - extra = (char*)old_ptr - (char*)*dworkptr; -#ifdef DEBUG - printf("zLUWorkInit: not aligned, extra %d\n", extra); -#endif - stack.top2 -= extra; - stack.used += extra; - } - } - if ( ! *dworkptr ) { - fprintf(stderr, "malloc fails for local dworkptr[]."); - return (isize + dsize + n); - } - - return 0; -} - - -/* - * Set up pointers for real working arrays. - */ -void -zSetRWork(int m, int panel_size, doublecomplex *dworkptr, - doublecomplex **dense, doublecomplex **tempv) -{ - doublecomplex zero = {0.0, 0.0}; - - int maxsuper = sp_ienv(3), - rowblk = sp_ienv(4); - *dense = dworkptr; - *tempv = *dense + panel_size*m; - zfill (*dense, m * panel_size, zero); - zfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); -} - -/* - * Free the working storage used by factor routines. - */ -void zLUWorkFree(int *iwork, doublecomplex *dwork, GlobalLU_t *Glu) -{ - if ( Glu->MemModel == SYSTEM ) { - SUPERLU_FREE (iwork); - SUPERLU_FREE (dwork); - } else { - stack.used -= (stack.size - stack.top2); - stack.top2 = stack.size; -/* zStackCompress(Glu); */ - } - - SUPERLU_FREE (expanders); - expanders = 0; -} - -/* Expand the data structures for L and U during the factorization. - * Return value: 0 - successful return - * > 0 - number of bytes allocated when run out of space - */ -int -zLUMemXpand(int jcol, - int next, /* number of elements currently in the factors */ - MemType mem_type, /* which type of memory to expand */ - int *maxlen, /* modified - maximum length of a data structure */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - void *new_mem; - -#ifdef DEBUG - printf("zLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", - jcol, next, *maxlen, mem_type); -#endif - - if (mem_type == USUB) - new_mem = zexpand(maxlen, mem_type, next, 1, Glu); - else - new_mem = zexpand(maxlen, mem_type, next, 0, Glu); - - if ( !new_mem ) { - int nzlmax = Glu->nzlmax; - int nzumax = Glu->nzumax; - int nzlumax = Glu->nzlumax; - fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); - return (zmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); - } - - switch ( mem_type ) { - case LUSUP: - Glu->lusup = (doublecomplex *) new_mem; - Glu->nzlumax = *maxlen; - break; - case UCOL: - Glu->ucol = (doublecomplex *) new_mem; - Glu->nzumax = *maxlen; - break; - case LSUB: - Glu->lsub = (int *) new_mem; - Glu->nzlmax = *maxlen; - break; - case USUB: - Glu->usub = (int *) new_mem; - Glu->nzumax = *maxlen; - break; - } - - return 0; - -} - - - -void -copy_mem_doublecomplex(int howmany, void *old, void *new) -{ - register int i; - doublecomplex *dold = old; - doublecomplex *dnew = new; - for (i = 0; i < howmany; i++) dnew[i] = dold[i]; -} - -/* - * Expand the existing storage to accommodate more fill-ins. - */ -void -*zexpand ( - int *prev_len, /* length used from previous call */ - MemType type, /* which part of the memory to expand */ - int len_to_copy, /* size of the memory to be copied to new store */ - int keep_prev, /* = 1: use prev_len; - = 0: compute new_len to expand */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ - float EXPAND = 1.5; - float alpha; - void *new_mem, *old_mem; - int new_len, tries, lword, extra, bytes_to_copy; - - alpha = EXPAND; - - if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ - new_len = *prev_len; - else { - new_len = alpha * *prev_len; - } - - if ( type == LSUB || type == USUB ) lword = sizeof(int); - else lword = sizeof(doublecomplex); - - if ( Glu->MemModel == SYSTEM ) { - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - if ( no_expand != 0 ) { - tries = 0; - if ( keep_prev ) { - if ( !new_mem ) return (NULL); - } else { - while ( !new_mem ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); - } - } - if ( type == LSUB || type == USUB ) { - copy_mem_int(len_to_copy, expanders[type].mem, new_mem); - } else { - copy_mem_doublecomplex(len_to_copy, expanders[type].mem, new_mem); - } - SUPERLU_FREE (expanders[type].mem); - } - expanders[type].mem = (void *) new_mem; - - } else { /* MemModel == USER */ - if ( no_expand == 0 ) { - new_mem = zuser_malloc(new_len * lword, HEAD); - if ( NotDoubleAlign(new_mem) && - (type == LUSUP || type == UCOL) ) { - old_mem = new_mem; - new_mem = (void *)DoubleAlign(new_mem); - extra = (char*)new_mem - (char*)old_mem; -#ifdef DEBUG - printf("expand(): not aligned, extra %d\n", extra); -#endif - stack.top1 += extra; - stack.used += extra; - } - expanders[type].mem = (void *) new_mem; - } - else { - tries = 0; - extra = (new_len - *prev_len) * lword; - if ( keep_prev ) { - if ( StackFull(extra) ) return (NULL); - } else { - while ( StackFull(extra) ) { - if ( ++tries > 10 ) return (NULL); - alpha = Reduce(alpha); - new_len = alpha * *prev_len; - extra = (new_len - *prev_len) * lword; - } - } - - if ( type != USUB ) { - new_mem = (void*)((char*)expanders[type + 1].mem + extra); - bytes_to_copy = (char*)stack.array + stack.top1 - - (char*)expanders[type + 1].mem; - user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); - - if ( type < USUB ) { - Glu->usub = expanders[USUB].mem = - (void*)((char*)expanders[USUB].mem + extra); - } - if ( type < LSUB ) { - Glu->lsub = expanders[LSUB].mem = - (void*)((char*)expanders[LSUB].mem + extra); - } - if ( type < UCOL ) { - Glu->ucol = expanders[UCOL].mem = - (void*)((char*)expanders[UCOL].mem + extra); - } - stack.top1 += extra; - stack.used += extra; - if ( type == UCOL ) { - stack.top1 += extra; /* Add same amount for USUB */ - stack.used += extra; - } - - } /* if ... */ - - } /* else ... */ - } - - expanders[type].size = new_len; - *prev_len = new_len; - if ( no_expand ) ++no_expand; - - return (void *) expanders[type].mem; - -} /* zexpand */ - - -/* - * Compress the work[] array to remove fragmentation. - */ -void -zStackCompress(GlobalLU_t *Glu) -{ - register int iword, dword, ndim; - char *last, *fragment; - int *ifrom, *ito; - doublecomplex *dfrom, *dto; - int *xlsub, *lsub, *xusub, *usub, *xlusup; - doublecomplex *ucol, *lusup; - - iword = sizeof(int); - dword = sizeof(doublecomplex); - ndim = Glu->n; - - xlsub = Glu->xlsub; - lsub = Glu->lsub; - xusub = Glu->xusub; - usub = Glu->usub; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - lusup = Glu->lusup; - - dfrom = ucol; - dto = (doublecomplex *)((char*)lusup + xlusup[ndim] * dword); - copy_mem_doublecomplex(xusub[ndim], dfrom, dto); - ucol = dto; - - ifrom = lsub; - ito = (int *) ((char*)ucol + xusub[ndim] * iword); - copy_mem_int(xlsub[ndim], ifrom, ito); - lsub = ito; - - ifrom = usub; - ito = (int *) ((char*)lsub + xlsub[ndim] * iword); - copy_mem_int(xusub[ndim], ifrom, ito); - usub = ito; - - last = (char*)usub + xusub[ndim] * iword; - fragment = (char*) (((char*)stack.array + stack.top1) - last); - stack.used -= (long int) fragment; - stack.top1 -= (long int) fragment; - - Glu->ucol = ucol; - Glu->lsub = lsub; - Glu->usub = usub; - -#ifdef DEBUG - printf("zStackCompress: fragment %d\n", fragment); - /* for (last = 0; last < ndim; ++last) - print_lu_col("After compress:", last, 0);*/ -#endif - -} - -/* - * Allocate storage for original matrix A - */ -void -zallocateA(int n, int nnz, doublecomplex **a, int **asub, int **xa) -{ - *a = (doublecomplex *) doublecomplexMalloc(nnz); - *asub = (int *) intMalloc(nnz); - *xa = (int *) intMalloc(n+1); -} - - -doublecomplex *doublecomplexMalloc(int n) -{ - doublecomplex *buf; - buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in doublecomplexMalloc()\n"); - } - return (buf); -} - -doublecomplex *doublecomplexCalloc(int n) -{ - doublecomplex *buf; - register int i; - doublecomplex zero = {0.0, 0.0}; - buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); - if ( !buf ) { - ABORT("SUPERLU_MALLOC failed for buf in doublecomplexCalloc()\n"); - } - for (i = 0; i < n; ++i) buf[i] = zero; - return (buf); -} - - -int zmemory_usage(const int nzlmax, const int nzumax, - const int nzlumax, const int n) -{ - register int iword, dword; - - iword = sizeof(int); - dword = sizeof(doublecomplex); - - return (10 * n * iword + - nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmyblas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmyblas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmyblas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,182 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: zmyblas2.c - * Purpose: - * Level 2 BLAS operations: solves and matvec, written in C. - * Note: - * This is only used when the system lacks an efficient BLAS library. - */ -#include "slu_dcomplex.h" - -/* - * Solves a dense UNIT lower triangular system. The unit lower - * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). - * The solution will be returned in the rhs vector. - */ -void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs ) -{ - int k; - doublecomplex x0, x1, x2, x3, temp; - doublecomplex *M0; - doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - - M0 = &M[0]; - - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - Mki2 = Mki1 + ldm + 1; - Mki3 = Mki2 + ldm + 1; - - x0 = rhs[firstcol]; - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x1, &rhs[firstcol+1], &temp); - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x2, &rhs[firstcol+2], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&x2, &x2, &temp); - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x3, &rhs[firstcol+3], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&x3, &x3, &temp); - zz_mult(&temp, &x2, Mki2); Mki2++; - z_sub(&x3, &x3, &temp); - - rhs[++firstcol] = x1; - rhs[++firstcol] = x2; - rhs[++firstcol] = x3; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x2, Mki2); Mki2++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x3, Mki3); Mki3++; - z_sub(&rhs[k], &rhs[k], &temp); - } - - M0 += 4 * ldm + 4; - } - - if ( firstcol < ncol - 1 ) { /* Do 2 columns */ - Mki0 = M0 + 1; - Mki1 = Mki0 + ldm + 1; - - x0 = rhs[firstcol]; - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&x1, &rhs[firstcol+1], &temp); - - rhs[++firstcol] = x1; - ++firstcol; - - for (k = firstcol; k < ncol; k++) { - zz_mult(&temp, &x0, Mki0); Mki0++; - z_sub(&rhs[k], &rhs[k], &temp); - zz_mult(&temp, &x1, Mki1); Mki1++; - z_sub(&rhs[k], &rhs[k], &temp); - } - } - -} - -/* - * Solves a dense upper triangular system. The upper triangular matrix is - * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned - * in the rhs vector. - */ -void -zusolve ( ldm, ncol, M, rhs ) -int ldm; /* in */ -int ncol; /* in */ -doublecomplex *M; /* in */ -doublecomplex *rhs; /* modified */ -{ - doublecomplex xj, temp; - int jcol, j, irow; - - jcol = ncol - 1; - - for (j = 0; j < ncol; j++) { - - z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */ - rhs[jcol] = xj; - - for (irow = 0; irow < jcol; irow++) { - zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */ - z_sub(&rhs[irow], &rhs[irow], &temp); - } - - jcol--; - - } -} - - -/* - * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. - * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. - */ -void zmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) -int ldm; /* in -- leading dimension of M */ -int nrow; /* in */ -int ncol; /* in */ -doublecomplex *M; /* in */ -doublecomplex *vec; /* in */ -doublecomplex *Mxvec; /* in/out */ -{ - doublecomplex vi0, vi1, vi2, vi3; - doublecomplex *M0, temp; - doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; - register int firstcol = 0; - int k; - - M0 = &M[0]; - - while ( firstcol < ncol - 3 ) { /* Do 4 columns */ - Mki0 = M0; - Mki1 = Mki0 + ldm; - Mki2 = Mki1 + ldm; - Mki3 = Mki2 + ldm; - - vi0 = vec[firstcol++]; - vi1 = vec[firstcol++]; - vi2 = vec[firstcol++]; - vi3 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - zz_mult(&temp, &vi0, Mki0); Mki0++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi1, Mki1); Mki1++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi2, Mki2); Mki2++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - zz_mult(&temp, &vi3, Mki3); Mki3++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - } - - M0 += 4 * ldm; - } - - while ( firstcol < ncol ) { /* Do 1 column */ - Mki0 = M0; - vi0 = vec[firstcol++]; - for (k = 0; k < nrow; k++) { - zz_mult(&temp, &vi0, Mki0); Mki0++; - z_add(&Mxvec[k], &Mxvec[k], &temp); - } - M0 += ldm; - } - -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,478 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_zdefs.h" - -/* - * Function prototypes - */ -void zlsolve(int, int, doublecomplex *, doublecomplex *); -void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *); -extern void zcheck_tempv(); - -void -zpanel_bmod ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - const int nseg, /* in */ - doublecomplex *dense, /* out, of size n by w */ - doublecomplex *tempv, /* working array */ - int *segrep, /* in */ - int *repfnz, /* in, of size n by w */ - GlobalLU_t *Glu, /* modified */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * - * Performs numeric block updates (sup-panel) in topological order. - * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. - * Special processing on the supernodal portion of L\U[*,j] - * - * Before entering this routine, the original nonzeros in the panel - * were already copied into the spa[m,w]. - * - * Updated/Output parameters- - * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned - * collectively in the m-by-w vector dense[*]. - * - */ - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - int incx = 1, incy = 1; - doublecomplex alpha, beta; -#endif - - register int k, ksub; - int fsupc, nsupc, nsupr, nrow; - int krep, krep_ind; - doublecomplex ukj, ukj1, ukj2; - int luptr, luptr1, luptr2; - int segsze; - int block_nrow; /* no of rows in a block row */ - register int lptr; /* Points to the row subscripts of a supernode */ - int kfnz, irow, no_zeros; - register int isub, isub1, i; - register int jj; /* Index through each column in the panel */ - int *xsup, *supno; - int *lsub, *xlsub; - doublecomplex *lusup; - int *xlusup; - int *repfnz_col; /* repfnz[] for a column in the panel */ - doublecomplex *dense_col; /* dense[] for a column in the panel */ - doublecomplex *tempv1; /* Used in 1-D update */ - doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */ - doublecomplex zero = {0.0, 0.0}; - doublecomplex one = {1.0, 0.0}; - doublecomplex comp_temp, comp_temp1; - register int ldaTmp; - register int r_ind, r_hi; - static int first = 1, maxsuper, rowblk, colblk; - flops_t *ops = stat->ops; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - if ( first ) { - maxsuper = sp_ienv(3); - rowblk = sp_ienv(4); - colblk = sp_ienv(5); - first = 0; - } - ldaTmp = maxsuper + rowblk; - - /* - * For each nonz supernode segment of U[*,j] in topological order - */ - k = nseg - 1; - for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ - - /* krep = representative of current k-th supernode - * fsupc = first supernodal column - * nsupc = no of columns in a supernode - * nsupr = no of rows in a supernode - */ - krep = segrep[k--]; - fsupc = xsup[supno[krep]]; - nsupc = krep - fsupc + 1; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nrow = nsupr - nsupc; - lptr = xlsub[fsupc]; - krep_ind = lptr + nsupc - 1; - - repfnz_col = repfnz; - dense_col = dense; - - if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ - - TriTmp = tempv; - - /* Sequence through each column in panel -- triangular solves */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += 4 * segsze * (segsze - 1); - ops[GEMV] += 8 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - z_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - z_sub(&ukj1, &ukj1, &comp_temp); - - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - luptr++; luptr1++; luptr2++; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } - - } else { /* segsze >= 4 */ - - /* Copy U[*,j] segment from dense[*] to TriTmp[*], which - holds the result of triangular solves. */ - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - TriTmp[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#else - ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, TriTmp, &incx ); -#endif -#else - zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); -#endif - - - } /* else ... */ - - } /* for jj ... end tri-solves */ - - /* Block row updates; push all the way into dense[*] block */ - for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { - - r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); - block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); - luptr = xlusup[fsupc] + nsupc + r_ind; - isub1 = lptr + nsupc + r_ind; - - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - /* Sequence through each column in panel -- matrix-vector */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - /* Perform a block update, and scatter the result of - matrix-vector to dense[]. */ - no_zeros = kfnz - fsupc; - luptr1 = luptr + nsupr * no_zeros; - MatvecTmp = &TriTmp[maxsuper]; - -#ifdef USE_VENDOR_BLAS - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#else - zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], - &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); -#endif -#else - zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], - TriTmp, MatvecTmp); -#endif - - /* Scatter MatvecTmp[*] into SPA dense[*] temporarily - * such that MatvecTmp[*] can be re-used for the - * the next blok row update. dense[] will be copied into - * global store after the whole panel has been finished. - */ - isub = isub1; - for (i = 0; i < block_nrow; i++) { - irow = lsub[isub]; - z_sub(&dense_col[irow], &dense_col[irow], - &MatvecTmp[i]); - MatvecTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } /* for each block row ... */ - - /* Scatter the triangular solves into SPA dense[*] */ - repfnz_col = repfnz; - TriTmp = tempv; - dense_col = dense; - - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - if ( segsze <= 3 ) continue; /* skip unrolled cases */ - - no_zeros = kfnz - fsupc; - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = TriTmp[i]; - TriTmp[i] = zero; - ++isub; - } - - } /* for jj ... */ - - } else { /* 1-D block modification */ - - - /* Sequence through each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++, - repfnz_col += m, dense_col += m) { - - kfnz = repfnz_col[krep]; - if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ - - segsze = krep - kfnz + 1; - luptr = xlusup[fsupc]; - - ops[TRSV] += 4 * segsze * (segsze - 1); - ops[GEMV] += 8 * nrow * segsze; - - /* Case 1: Update U-segment of size 1 -- col-col update */ - if ( segsze == 1 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc; - - for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { - irow = lsub[i]; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - ++luptr; - } - - } else if ( segsze <= 3 ) { - ukj = dense_col[lsub[krep_ind]]; - luptr += nsupr*(nsupc-1) + nsupc-1; - ukj1 = dense_col[lsub[krep_ind - 1]]; - luptr1 = luptr - nsupr; - - if ( segsze == 2 ) { - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - z_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } else { - ukj2 = dense_col[lsub[krep_ind - 2]]; - luptr2 = luptr1 - nsupr; - zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); - z_sub(&ukj1, &ukj1, &comp_temp); - - zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&ukj, &ukj, &comp_temp); - dense_col[lsub[krep_ind]] = ukj; - dense_col[lsub[krep_ind-1]] = ukj1; - for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - ++luptr; ++luptr1; ++luptr2; - zz_mult(&comp_temp, &ukj, &lusup[luptr]); - zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); - z_add(&comp_temp, &comp_temp, &comp_temp1); - z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); - } - } - - } else { /* segsze >= 4 */ - /* - * Perform a triangular solve and block update, - * then scatter the result of sup-col update to dense[]. - */ - no_zeros = kfnz - fsupc; - - /* Copy U[*,j] segment from dense[*] to tempv[*]: - * The result of triangular solve is in tempv[*]; - * The result of matrix vector update is in dense_col[*] - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; ++i) { - irow = lsub[isub]; - tempv[i] = dense_col[irow]; /* Gather */ - ++isub; - } - - /* start effective triangle */ - luptr += nsupr * no_zeros + no_zeros; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#else - ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], - &nsupr, tempv, &incx ); -#endif - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - alpha = one; - beta = zero; -#ifdef _CRAY - CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#else - zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], - &nsupr, tempv, &incx, &beta, tempv1, &incy ); -#endif -#else - zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); - - luptr += segsze; /* Dense matrix-vector */ - tempv1 = &tempv[segsze]; - zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); -#endif - - /* Scatter tempv[*] into SPA dense[*] temporarily, such - * that tempv[*] can be used for the triangular solve of - * the next column of the panel. They will be copied into - * ucol[*] after the whole panel has been finished. - */ - isub = lptr + no_zeros; - for (i = 0; i < segsze; i++) { - irow = lsub[isub]; - dense_col[irow] = tempv[i]; - tempv[i] = zero; - isub++; - } - - /* Scatter the update from tempv1[*] into SPA dense[*] */ - /* Start dense rectangular L */ - for (i = 0; i < nrow; i++) { - irow = lsub[isub]; - z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); - tempv1[i] = zero; - ++isub; - } - - } /* else segsze>=4 ... */ - - } /* for each column in the panel... */ - - } /* else 1-D update ... */ - - } /* for each updating supernode ... */ - -} - - - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,247 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -void -zpanel_dfs ( - const int m, /* in - number of rows in the matrix */ - const int w, /* in */ - const int jcol, /* in */ - SuperMatrix *A, /* in - original matrix */ - int *perm_r, /* in */ - int *nseg, /* out */ - doublecomplex *dense, /* out */ - int *panel_lsub, /* out */ - int *segrep, /* out */ - int *repfnz, /* out */ - int *xprune, /* out */ - int *marker, /* out */ - int *parent, /* working array */ - int *xplore, /* working array */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* - * Purpose - * ======= - * - * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). - * - * A supernode representative is the last column of a supernode. - * The nonzeros in U[*,j] are segments that end at supernodal - * representatives. - * - * The routine returns one list of the supernodal representatives - * in topological order of the dfs that generates them. This list is - * a superset of the topological order of each individual column within - * the panel. - * The location of the first nonzero in each supernodal segment - * (supernodal entry location) is also returned. Each column has a - * separate list for this purpose. - * - * Two marker arrays are used for dfs: - * marker[i] == jj, if i was visited during dfs of current column jj; - * marker1[i] >= jcol, if i was visited by earlier columns in this panel; - * - * marker: A-row --> A-row/col (0/1) - * repfnz: SuperA-col --> PA-row - * parent: SuperA-col --> SuperA-col - * xplore: SuperA-col --> index to L-structure - * - */ - NCPformat *Astore; - doublecomplex *a; - int *asub; - int *xa_begin, *xa_end; - int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; - int k, krow, kmark, kperm; - int xdfs, maxdfs, kpar; - int jj; /* index through each column in the panel */ - int *marker1; /* marker1[jj] >= jcol if vertex jj was visited - by a previous column within this panel. */ - int *repfnz_col; /* start of each column in the panel */ - doublecomplex *dense_col; /* start of each column in the panel */ - int nextl_col; /* next available position in panel_lsub[*,jj] */ - int *xsup, *supno; - int *lsub, *xlsub; - - /* Initialize pointers */ - Astore = A->Store; - a = Astore->nzval; - asub = Astore->rowind; - xa_begin = Astore->colbeg; - xa_end = Astore->colend; - marker1 = marker + m; - repfnz_col = repfnz; - dense_col = dense; - *nseg = 0; - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - - /* For each column in the panel */ - for (jj = jcol; jj < jcol + w; jj++) { - nextl_col = (jj - jcol) * m; - -#ifdef CHK_DFS - printf("\npanel col %d: ", jj); -#endif - - /* For each nonz in A[*,jj] do dfs */ - for (k = xa_begin[jj]; k < xa_end[jj]; k++) { - krow = asub[k]; - dense_col[krow] = a[k]; - kmark = marker[krow]; - if ( kmark == jj ) - continue; /* krow visited before, go to the next nonzero */ - - /* For each unmarked nbr krow of jj - * krow is in L: place it in structure of L[*,jj] - */ - marker[krow] = jj; - kperm = perm_r[krow]; - - if ( kperm == EMPTY ) { - panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ - } - /* - * krow is in U: if its supernode-rep krep - * has been explored, update repfnz[*] - */ - else { - - krep = xsup[supno[kperm]+1] - 1; - myfnz = repfnz_col[krep]; - -#ifdef CHK_DFS - printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); -#endif - if ( myfnz != EMPTY ) { /* Representative visited before */ - if ( myfnz > kperm ) repfnz_col[krep] = kperm; - /* continue; */ - } - else { - /* Otherwise, perform dfs starting at krep */ - oldrep = EMPTY; - parent[krep] = oldrep; - repfnz_col[krep] = kperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - do { - /* - * For each unmarked kchild of krep - */ - while ( xdfs < maxdfs ) { - - kchild = lsub[xdfs]; - xdfs++; - chmark = marker[kchild]; - - if ( chmark != jj ) { /* Not reached yet */ - marker[kchild] = jj; - chperm = perm_r[kchild]; - - /* Case kchild is in L: place it in L[*,j] */ - if ( chperm == EMPTY ) { - panel_lsub[nextl_col++] = kchild; - } - /* Case kchild is in U: - * chrep = its supernode-rep. If its rep has - * been explored, update its repfnz[*] - */ - else { - - chrep = xsup[supno[chperm]+1] - 1; - myfnz = repfnz_col[chrep]; -#ifdef CHK_DFS - printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); -#endif - if ( myfnz != EMPTY ) { /* Visited before */ - if ( myfnz > chperm ) - repfnz_col[chrep] = chperm; - } - else { - /* Cont. dfs at snode-rep of kchild */ - xplore[krep] = xdfs; - oldrep = krep; - krep = chrep; /* Go deeper down G(L) */ - parent[krep] = oldrep; - repfnz_col[krep] = chperm; - xdfs = xlsub[krep]; - maxdfs = xprune[krep]; -#ifdef CHK_DFS - printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } /* else */ - - } /* else */ - - } /* if... */ - - } /* while xdfs < maxdfs */ - - /* krow has no more unexplored nbrs: - * Place snode-rep krep in postorder DFS, if this - * segment is seen for the first time. (Note that - * "repfnz[krep]" may change later.) - * Backtrack dfs to its parent. - */ - if ( marker1[krep] < jcol ) { - segrep[*nseg] = krep; - ++(*nseg); - marker1[krep] = jj; - } - - kpar = parent[krep]; /* Pop stack, mimic recursion */ - if ( kpar == EMPTY ) break; /* dfs done */ - krep = kpar; - xdfs = xplore[krep]; - maxdfs = xprune[krep]; - -#ifdef CHK_DFS - printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); - for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); - printf("\n"); -#endif - } while ( kpar != EMPTY ); /* do-while - until empty stack */ - - } /* else */ - - } /* else */ - - } /* for each nonz in A[*,jj] */ - - repfnz_col += m; /* Move to next column */ - dense_col += m; - - } /* for jj ... */ - -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,108 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include "slu_zdefs.h" - -double -zPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, - SuperMatrix *L, SuperMatrix *U) -{ -/* - * Purpose - * ======= - * - * Compute the reciprocal pivot growth factor of the leading ncols columns - * of the matrix, using the formula: - * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) - * - * Arguments - * ========= - * - * ncols (input) int - * The number of columns of matrices A, L and U. - * - * A (input) SuperMatrix* - * Original matrix A, permuted by columns, of dimension - * (A->nrow, A->ncol). The type of A can be: - * Stype = NC; Dtype = SLU_Z; Mtype = GE. - * - * L (output) SuperMatrix* - * The factor L from the factorization Pr*A=L*U; use compressed row - * subscripts storage for supernodes, i.e., L has type: - * Stype = SC; Dtype = SLU_Z; Mtype = TRLU. - * - * U (output) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise - * storage scheme, i.e., U has types: Stype = NC; - * Dtype = SLU_Z; Mtype = TRU. - * - */ - NCformat *Astore; - SCformat *Lstore; - NCformat *Ustore; - doublecomplex *Aval, *Lval, *Uval; - int fsupc, nsupr, luptr, nz_in_U; - int i, j, k, oldcol; - int *inv_perm_c; - double rpg, maxaj, maxuj; - extern double dlamch_(char *); - double smlnum; - doublecomplex *luval; - doublecomplex temp_comp; - - /* Get machine constants. */ - smlnum = dlamch_("S"); - rpg = 1. / smlnum; - - Astore = A->Store; - Lstore = L->Store; - Ustore = U->Store; - Aval = Astore->nzval; - Lval = Lstore->nzval; - Uval = Ustore->nzval; - - inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); - for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; - - for (k = 0; k <= Lstore->nsuper; ++k) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - luptr = L_NZ_START(fsupc); - luval = &Lval[luptr]; - nz_in_U = 1; - - for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { - maxaj = 0.; - oldcol = inv_perm_c[j]; - for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) - maxaj = SUPERLU_MAX( maxaj, z_abs1( &Aval[i]) ); - - maxuj = 0.; - for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) - maxuj = SUPERLU_MAX( maxuj, z_abs1( &Uval[i]) ); - - /* Supernode */ - for (i = 0; i < nz_in_U; ++i) - maxuj = SUPERLU_MAX( maxuj, z_abs1( &luval[i]) ); - - ++nz_in_U; - luval += nsupr; - - if ( maxuj == 0. ) - rpg = SUPERLU_MIN( rpg, 1.); - else - rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); - } - - if ( j >= ncols ) break; - } - - SUPERLU_FREE(inv_perm_c); - return (rpg); -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,174 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include -#include "slu_zdefs.h" - -#undef DEBUG - -int -zpivotL( - const int jcol, /* in */ - const double u, /* in - diagonal pivoting threshold */ - int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ - int *perm_r, /* may be modified */ - int *iperm_r, /* in - inverse of perm_r */ - int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ - int *pivrow, /* out */ - GlobalLU_t *Glu, /* modified - global LU data structures */ - SuperLUStat_t *stat /* output */ - ) -{ -/* - * Purpose - * ======= - * Performs the numerical pivoting on the current column of L, - * and the CDIV operation. - * - * Pivot policy: - * (1) Compute thresh = u * max_(i>=j) abs(A_ij); - * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN - * pivot row = k; - * ELSE IF abs(A_jj) >= thresh THEN - * pivot row = j; - * ELSE - * pivot row = m; - * - * Note: If you absolutely want to use a given pivot order, then set u=0.0. - * - * Return value: 0 success; - * i > 0 U(i,i) is exactly zero. - * - */ - doublecomplex one = {1.0, 0.0}; - int fsupc; /* first column in the supernode */ - int nsupc; /* no of columns in the supernode */ - int nsupr; /* no of rows in the supernode */ - int lptr; /* points to the starting subscript of the supernode */ - int pivptr, old_pivptr, diag, diagind; - double pivmax, rtemp, thresh; - doublecomplex temp; - doublecomplex *lu_sup_ptr; - doublecomplex *lu_col_ptr; - int *lsub_ptr; - int isub, icol, k, itemp; - int *lsub, *xlsub; - doublecomplex *lusup; - int *xlusup; - flops_t *ops = stat->ops; - - /* Initialize pointers */ - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; - nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ - lptr = xlsub[fsupc]; - nsupr = xlsub[fsupc+1] - lptr; - lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ - lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ - lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ - -#ifdef DEBUG -if ( jcol == MIN_COL ) { - printf("Before cdiv: col %d\n", jcol); - for (k = nsupc; k < nsupr; k++) - printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); -} -#endif - - /* Determine the largest abs numerical value for partial pivoting; - Also search for user-specified pivot, and diagonal element. */ - if ( *usepr ) *pivrow = iperm_r[jcol]; - diagind = iperm_c[jcol]; - pivmax = 0.0; - pivptr = nsupc; - diag = EMPTY; - old_pivptr = nsupc; - for (isub = nsupc; isub < nsupr; ++isub) { - rtemp = z_abs1 (&lu_col_ptr[isub]); - if ( rtemp > pivmax ) { - pivmax = rtemp; - pivptr = isub; - } - if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; - if ( lsub_ptr[isub] == diagind ) diag = isub; - } - - /* Test for singularity */ - if ( pivmax == 0.0 ) { - *pivrow = lsub_ptr[pivptr]; - perm_r[*pivrow] = jcol; - *usepr = 0; - return (jcol+1); - } - - thresh = u * pivmax; - - /* Choose appropriate pivotal element by our policy. */ - if ( *usepr ) { - rtemp = z_abs1 (&lu_col_ptr[old_pivptr]); - if ( rtemp != 0.0 && rtemp >= thresh ) - pivptr = old_pivptr; - else - *usepr = 0; - } - if ( *usepr == 0 ) { - /* Use diagonal pivot? */ - if ( diag >= 0 ) { /* diagonal exists */ - rtemp = z_abs1 (&lu_col_ptr[diag]); - if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; - } - *pivrow = lsub_ptr[pivptr]; - } - - /* Record pivot row */ - perm_r[*pivrow] = jcol; - - /* Interchange row subscripts */ - if ( pivptr != nsupc ) { - itemp = lsub_ptr[pivptr]; - lsub_ptr[pivptr] = lsub_ptr[nsupc]; - lsub_ptr[nsupc] = itemp; - - /* Interchange numerical values as well, for the whole snode, such - * that L is indexed the same way as A. - */ - for (icol = 0; icol <= nsupc; icol++) { - itemp = pivptr + icol * nsupr; - temp = lu_sup_ptr[itemp]; - lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; - lu_sup_ptr[nsupc + icol*nsupr] = temp; - } - } /* if */ - - /* cdiv operation */ - ops[FACT] += 10 * (nsupr - nsupc); - - z_div(&temp, &one, &lu_col_ptr[nsupc]); - for (k = nsupc+1; k < nsupr; k++) - zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpruneL.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpruneL.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpruneL.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,147 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -void -zpruneL( - const int jcol, /* in */ - const int *perm_r, /* in */ - const int pivrow, /* in */ - const int nseg, /* in */ - const int *segrep, /* in */ - const int *repfnz, /* in */ - int *xprune, /* out */ - GlobalLU_t *Glu /* modified - global LU data structures */ - ) -{ -/* - * Purpose - * ======= - * Prunes the L-structure of supernodes whose L-structure - * contains the current pivot row "pivrow" - * - */ - doublecomplex utemp; - int jsupno, irep, irep1, kmin, kmax, krow, movnum; - int i, ktemp, minloc, maxloc; - int do_prune; /* logical variable */ - int *xsup, *supno; - int *lsub, *xlsub; - doublecomplex *lusup; - int *xlusup; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - /* - * For each supernode-rep irep in U[*,j] - */ - jsupno = supno[jcol]; - for (i = 0; i < nseg; i++) { - - irep = segrep[i]; - irep1 = irep + 1; - do_prune = FALSE; - - /* Don't prune with a zero U-segment */ - if ( repfnz[irep] == EMPTY ) - continue; - - /* If a snode overlaps with the next panel, then the U-segment - * is fragmented into two parts -- irep and irep1. We should let - * pruning occur at the rep-column in irep1's snode. - */ - if ( supno[irep] == supno[irep1] ) /* Don't prune */ - continue; - - /* - * If it has not been pruned & it has a nonz in row L[pivrow,i] - */ - if ( supno[irep] != jsupno ) { - if ( xprune[irep] >= xlsub[irep1] ) { - kmin = xlsub[irep]; - kmax = xlsub[irep1] - 1; - for (krow = kmin; krow <= kmax; krow++) - if ( lsub[krow] == pivrow ) { - do_prune = TRUE; - break; - } - } - - if ( do_prune ) { - - /* Do a quicksort-type partition - * movnum=TRUE means that the num values have to be exchanged. - */ - movnum = FALSE; - if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ - movnum = TRUE; - - while ( kmin <= kmax ) { - - if ( perm_r[lsub[kmax]] == EMPTY ) - kmax--; - else if ( perm_r[lsub[kmin]] != EMPTY ) - kmin++; - else { /* kmin below pivrow, and kmax above pivrow: - * interchange the two subscripts - */ - ktemp = lsub[kmin]; - lsub[kmin] = lsub[kmax]; - lsub[kmax] = ktemp; - - /* If the supernode has only one column, then we - * only keep one set of subscripts. For any subscript - * interchange performed, similar interchange must be - * done on the numerical values. - */ - if ( movnum ) { - minloc = xlusup[irep] + (kmin - xlsub[irep]); - maxloc = xlusup[irep] + (kmax - xlsub[irep]); - utemp = lusup[minloc]; - lusup[minloc] = lusup[maxloc]; - lusup[maxloc] = utemp; - } - - kmin++; - kmax--; - - } - - } /* while */ - - xprune[irep] = kmin; /* Pruning */ - -#ifdef CHK_PRUNE - printf(" After zpruneL(),using col %d: xprune[%d] = %d\n", - jcol, irep, kmin); -#endif - } /* if do_prune */ - - } /* if */ - - } /* for each U-segment... */ -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zreadhb.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zreadhb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zreadhb.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -#include -#include -#include "slu_zdefs.h" - - -/* Eat up the rest of the current line */ -int zDumpLine(FILE *fp) -{ - register int c; - while ((c = fgetc(fp)) != '\n') ; - return 0; -} - -int zParseIntFormat(char *buf, int *num, int *size) -{ - char *tmp; - - tmp = buf; - while (*tmp++ != '(') ; - sscanf(tmp, "%d", num); - while (*tmp != 'I' && *tmp != 'i') ++tmp; - ++tmp; - sscanf(tmp, "%d", size); - return 0; -} - -int zParseFloatFormat(char *buf, int *num, int *size) -{ - char *tmp, *period; - - tmp = buf; - while (*tmp++ != '(') ; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' - && *tmp != 'F' && *tmp != 'f') { - /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the - num picked up refers to P, which should be skipped. */ - if (*tmp=='p' || *tmp=='P') { - ++tmp; - *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ - } else { - ++tmp; - } - } - ++tmp; - period = tmp; - while (*period != '.' && *period != ')') ++period ; - *period = '\0'; - *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ - - return 0; -} - -int zReadVector(FILE *fp, int n, int *where, int perline, int persize) -{ - register int i, j, item; - char tmp, buf[100]; - - i = 0; - while (i < n) { - fgets(buf, 100, fp); /* read a line at a time */ - for (j=0; jops; - - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - - nextlu = xlusup[jcol]; - - /* - * Process the supernodal portion of L\U[*,j] - */ - for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { - irow = lsub[isub]; - lusup[nextlu] = dense[irow]; - dense[irow] = comp_zero; - ++nextlu; - } - - xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ - - if ( fsupc < jcol ) { - - luptr = xlusup[fsupc]; - nsupr = xlsub[fsupc+1] - xlsub[fsupc]; - nsupc = jcol - fsupc; /* Excluding jcol */ - ufirst = xlusup[jcol]; /* Points to the beginning of column - jcol in supernode L\U(jsupno). */ - nrow = nsupr - nsupc; - - ops[TRSV] += 4 * nsupc * (nsupc - 1); - ops[GEMV] += 8 * nrow * nsupc; - -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#else - ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, - &lusup[ufirst], &incx ); - zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, - &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); -#endif -#else - zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); - zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], - &lusup[ufirst], &tempv[0] ); - - /* Scatter tempv[*] into lusup[*] */ - iptr = ufirst + nsupc; - for (i = 0; i < nrow; i++) { - z_sub(&lusup[iptr], &lusup[iptr], &tempv[i]); - ++iptr; - tempv[i] = comp_zero; - } -#endif - - } - - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include "slu_zdefs.h" - -int -zsnode_dfs ( - const int jcol, /* in - start of the supernode */ - const int kcol, /* in - end of the supernode */ - const int *asub, /* in */ - const int *xa_begin, /* in */ - const int *xa_end, /* in */ - int *xprune, /* out */ - int *marker, /* modified */ - GlobalLU_t *Glu /* modified */ - ) -{ -/* Purpose - * ======= - * zsnode_dfs() - Determine the union of the row structures of those - * columns within the relaxed snode. - * Note: The relaxed snodes are leaves of the supernodal etree, therefore, - * the portion outside the rectangular supernode must be zero. - * - * Return value - * ============ - * 0 success; - * >0 number of bytes allocated when run out of memory. - * - */ - register int i, k, ifrom, ito, nextl, new_next; - int nsuper, krow, kmark, mem_error; - int *xsup, *supno; - int *lsub, *xlsub; - int nzlmax; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - nzlmax = Glu->nzlmax; - - nsuper = ++supno[jcol]; /* Next available supernode number */ - nextl = xlsub[jcol]; - - for (i = jcol; i <= kcol; i++) { - /* For each nonzero in A[*,i] */ - for (k = xa_begin[i]; k < xa_end[i]; k++) { - krow = asub[k]; - kmark = marker[krow]; - if ( kmark != kcol ) { /* First time visit krow */ - marker[krow] = kcol; - lsub[nextl++] = krow; - if ( nextl >= nzlmax ) { - if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - } - } - supno[i] = nsuper; - } - - /* Supernode > 1, then make a copy of the subscripts for pruning */ - if ( jcol < kcol ) { - new_next = nextl + (nextl - xlsub[jcol]); - while ( new_next > nzlmax ) { - if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) - return (mem_error); - lsub = Glu->lsub; - } - ito = nextl; - for (ifrom = xlsub[jcol]; ifrom < nextl; ) - lsub[ito++] = lsub[ifrom++]; - for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; - nextl = ito; - } - - xsup[nsuper+1] = kcol + 1; - supno[kcol+1] = nsuper; - xprune[kcol] = nextl; - xlsub[kcol+1] = nextl; - - return 0; -} - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas2.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,565 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - * File name: zsp_blas2.c - * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. - */ - -#include "slu_zdefs.h" - -/* - * Function prototypes - */ -void zusolve(int, int, doublecomplex*, doublecomplex*); -void zlsolve(int, int, doublecomplex*, doublecomplex*); -void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); - - -int -sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, - SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info) -{ -/* - * Purpose - * ======= - * - * sp_ztrsv() solves one of the systems of equations - * A*x = b, or A'*x = b, - * where b and x are n element vectors and A is a sparse unit , or - * non-unit, upper or lower triangular matrix. - * No test for singularity or near-singularity is included in this - * routine. Such tests must be performed before calling this routine. - * - * Parameters - * ========== - * - * uplo - (input) char* - * On entry, uplo specifies whether the matrix is an upper or - * lower triangular matrix as follows: - * uplo = 'U' or 'u' A is an upper triangular matrix. - * uplo = 'L' or 'l' A is a lower triangular matrix. - * - * trans - (input) char* - * On entry, trans specifies the equations to be solved as - * follows: - * trans = 'N' or 'n' A*x = b. - * trans = 'T' or 't' A'*x = b. - * trans = 'C' or 'c' A^H*x = b. - * - * diag - (input) char* - * On entry, diag specifies whether or not A is unit - * triangular as follows: - * diag = 'U' or 'u' A is assumed to be unit triangular. - * diag = 'N' or 'n' A is not assumed to be unit - * triangular. - * - * L - (input) SuperMatrix* - * The factor L from the factorization Pr*A*Pc=L*U. Use - * compressed row subscripts storage for supernodes, - * i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU. - * - * U - (input) SuperMatrix* - * The factor U from the factorization Pr*A*Pc=L*U. - * U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU. - * - * x - (input/output) doublecomplex* - * Before entry, the incremented array X must contain the n - * element right-hand side vector b. On exit, X is overwritten - * with the solution vector x. - * - * info - (output) int* - * If *info = -i, the i-th argument had an illegal value. - * - */ -#ifdef _CRAY - _fcd ftcs1 = _cptofcd("L", strlen("L")), - ftcs2 = _cptofcd("N", strlen("N")), - ftcs3 = _cptofcd("U", strlen("U")); -#endif - SCformat *Lstore; - NCformat *Ustore; - doublecomplex *Lval, *Uval; - int incx = 1, incy = 1; - doublecomplex temp; - doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; - doublecomplex comp_zero = {0.0, 0.0}; - int nrow; - int fsupc, nsupr, nsupc, luptr, istart, irow; - int i, k, iptr, jcol; - doublecomplex *work; - flops_t solve_ops; - - /* Test the input parameters */ - *info = 0; - if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; - else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && - !lsame_(trans, "C")) *info = -2; - else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; - else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; - else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; - if ( *info ) { - i = -(*info); - xerbla_("sp_ztrsv", &i); - return 0; - } - - Lstore = L->Store; - Lval = Lstore->nzval; - Ustore = U->Store; - Uval = Ustore->nzval; - solve_ops = 0; - - if ( !(work = doublecomplexCalloc(L->nrow)) ) - ABORT("Malloc fails for work in sp_ztrsv()."); - - if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L)*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - nrow = nsupr - nsupc; - - /* 1 z_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; - solve_ops += 8 * nrow * nsupc; - - if ( nsupc == 1 ) { - for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { - irow = L_SUB(iptr); - ++luptr; - zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#else - ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); - - zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], - &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); -#endif -#else - zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); - - zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], - &x[fsupc], &work[0] ); -#endif - - iptr = istart + nsupc; - for (i = 0; i < nrow; ++i, ++iptr) { - irow = L_SUB(iptr); - z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ - work[i] = comp_zero; - - } - } - } /* for k ... */ - - } else { - /* Form x := inv(U)*x */ - - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; k--) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - /* 1 z_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[fsupc], &Uval[i]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } else { -#ifdef USE_VENDOR_BLAS -#ifdef _CRAY - CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif -#else - zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); -#endif - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); - i++) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[jcol], &Uval[i]); - z_sub(&x[irow], &x[irow], &comp_zero); - } - } - } - } /* for k ... */ - - } - } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := inv(L')*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 8 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - zz_mult(&comp_zero, &x[irow], &Lval[i]); - z_sub(&x[jcol], &x[jcol], &comp_zero); - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += 4 * nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := inv(U')*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - zz_mult(&comp_zero, &x[irow], &Uval[i]); - z_sub(&x[jcol], &x[jcol], &comp_zero); - } - } - - /* 1 z_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd("T", strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } else { /* Form x := conj(inv(A'))*x */ - - if ( lsame_(uplo, "L") ) { - /* Form x := conj(inv(L'))*x */ - if ( L->nrow == 0 ) return 0; /* Quick return */ - - for (k = Lstore->nsuper; k >= 0; --k) { - fsupc = L_FST_SUPC(k); - istart = L_SUB_START(fsupc); - nsupr = L_SUB_START(fsupc+1) - istart; - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - solve_ops += 8 * (nsupr - nsupc) * nsupc; - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - iptr = istart + nsupc; - for (i = L_NZ_START(jcol) + nsupc; - i < L_NZ_START(jcol+1); i++) { - irow = L_SUB(iptr); - zz_conj(&temp, &Lval[i]); - zz_mult(&comp_zero, &x[irow], &temp); - z_sub(&x[jcol], &x[jcol], &comp_zero); - iptr++; - } - } - - if ( nsupc > 1 ) { - solve_ops += 4 * nsupc * (nsupc - 1); -#ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd(trans, strlen("T")); - ftcs3 = _cptofcd("U", strlen("U")); - ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } - } else { - /* Form x := conj(inv(U'))*x */ - if ( U->nrow == 0 ) return 0; /* Quick return */ - - for (k = 0; k <= Lstore->nsuper; k++) { - fsupc = L_FST_SUPC(k); - nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); - nsupc = L_FST_SUPC(k+1) - fsupc; - luptr = L_NZ_START(fsupc); - - for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { - solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); - for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { - irow = U_SUB(i); - zz_conj(&temp, &Uval[i]); - zz_mult(&comp_zero, &x[irow], &temp); - z_sub(&x[jcol], &x[jcol], &comp_zero); - } - } - - /* 1 z_div costs 10 flops */ - solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; - - if ( nsupc == 1 ) { - zz_conj(&temp, &Lval[luptr]); - z_div(&x[fsupc], &x[fsupc], &temp); - } else { -#ifdef _CRAY - ftcs1 = _cptofcd("U", strlen("U")); - ftcs2 = _cptofcd(trans, strlen("T")); - ftcs3 = _cptofcd("N", strlen("N")); - ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#else - ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, - &x[fsupc], &incx); -#endif - } - } /* for k ... */ - } - } - - stat->ops[SOLVE] += solve_ops; - SUPERLU_FREE(work); - return 0; -} - - - -int -sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, - int incx, doublecomplex beta, doublecomplex *y, int incy) -{ -/* Purpose - ======= - - sp_zgemv() performs one of the matrix-vector operations - y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, - where alpha and beta are scalars, x and y are vectors and A is a - sparse A->nrow by A->ncol matrix. - - Parameters - ========== - - TRANS - (input) char* - On entry, TRANS specifies the operation to be performed as - follows: - TRANS = 'N' or 'n' y := alpha*A*x + beta*y. - TRANS = 'T' or 't' y := alpha*A'*x + beta*y. - TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. - - ALPHA - (input) doublecomplex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Before entry, the leading m by n part of the array A must - contain the matrix of coefficients. - - X - (input) doublecomplex*, array of DIMENSION at least - ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. - Before entry, the incremented array X must contain the - vector x. - - INCX - (input) int - On entry, INCX specifies the increment for the elements of - X. INCX must not be zero. - - BETA - (input) doublecomplex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then Y need not be set on input. - - Y - (output) doublecomplex*, array of DIMENSION at least - ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' - and at least - ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. - Before entry with BETA non-zero, the incremented array Y - must contain the vector y. On exit, Y is overwritten by the - updated vector y. - - INCY - (input) int - On entry, INCY specifies the increment for the elements of - Y. INCY must not be zero. - - ==== Sparse Level 2 Blas routine. -*/ - - /* Local variables */ - NCformat *Astore; - doublecomplex *Aval; - int info; - doublecomplex temp, temp1; - int lenx, leny, i, j, irow; - int iy, jx, jy, kx, ky; - int notran; - doublecomplex comp_zero = {0.0, 0.0}; - doublecomplex comp_one = {1.0, 0.0}; - - notran = lsame_(trans, "N"); - Astore = A->Store; - Aval = Astore->nzval; - - /* Test the input parameters */ - info = 0; - if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; - else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; - else if (incx == 0) info = 5; - else if (incy == 0) info = 8; - if (info != 0) { - xerbla_("sp_zgemv ", &info); - return 0; - } - - /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || - z_eq(&alpha, &comp_zero) && - z_eq(&beta, &comp_one)) - return 0; - - - /* Set LENX and LENY, the lengths of the vectors x and y, and set - up the start points in X and Y. */ - if (lsame_(trans, "N")) { - lenx = A->ncol; - leny = A->nrow; - } else { - lenx = A->nrow; - leny = A->ncol; - } - if (incx > 0) kx = 0; - else kx = - (lenx - 1) * incx; - if (incy > 0) ky = 0; - else ky = - (leny - 1) * incy; - - /* Start the operations. In this version the elements of A are - accessed sequentially with one pass through A. */ - /* First form y := beta*y. */ - if ( !z_eq(&beta, &comp_one) ) { - if (incy == 1) { - if ( z_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) y[i] = comp_zero; - else - for (i = 0; i < leny; ++i) - zz_mult(&y[i], &beta, &y[i]); - } else { - iy = ky; - if ( z_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) { - y[iy] = comp_zero; - iy += incy; - } - else - for (i = 0; i < leny; ++i) { - zz_mult(&y[iy], &beta, &y[iy]); - iy += incy; - } - } - } - - if ( z_eq(&alpha, &comp_zero) ) return 0; - - if ( notran ) { - /* Form y := alpha*A*x + y. */ - jx = kx; - if (incy == 1) { - for (j = 0; j < A->ncol; ++j) { - if ( !z_eq(&x[jx], &comp_zero) ) { - zz_mult(&temp, &alpha, &x[jx]); - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zz_mult(&temp1, &temp, &Aval[i]); - z_add(&y[irow], &y[irow], &temp1); - } - } - jx += incx; - } - } else { - ABORT("Not implemented."); - } - } else { - /* Form y := alpha*A'*x + y. */ - jy = ky; - if (incx == 1) { - for (j = 0; j < A->ncol; ++j) { - temp = comp_zero; - for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { - irow = Astore->rowind[i]; - zz_mult(&temp1, &Aval[i], &x[irow]); - z_add(&temp, &temp, &temp1); - } - zz_mult(&temp1, &alpha, &temp); - z_add(&y[jy], &y[jy], &temp1); - jy += incy; - } - } else { - ABORT("Not implemented."); - } - } - return 0; -} /* sp_zgemv */ - diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas3.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ - -/* - * -- SuperLU routine (version 2.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * November 15, 1997 - * - */ -/* - * File name: sp_blas3.c - * Purpose: Sparse BLAS3, using some dense BLAS3 operations. - */ - -#include "slu_zdefs.h" - -int -sp_zgemm(char *transa, char *transb, int m, int n, int k, - doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, - doublecomplex beta, doublecomplex *c, int ldc) -{ -/* Purpose - ======= - - sp_z performs one of the matrix-matrix operations - - C := alpha*op( A )*op( B ) + beta*C, - - where op( X ) is one of - - op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), - - alpha and beta are scalars, and A, B and C are matrices, with op( A ) - an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - - - Parameters - ========== - - TRANSA - (input) char* - On entry, TRANSA specifies the form of op( A ) to be used in - the matrix multiplication as follows: - TRANSA = 'N' or 'n', op( A ) = A. - TRANSA = 'T' or 't', op( A ) = A'. - TRANSA = 'C' or 'c', op( A ) = conjg( A' ). - Unchanged on exit. - - TRANSB - (input) char* - On entry, TRANSB specifies the form of op( B ) to be used in - the matrix multiplication as follows: - TRANSB = 'N' or 'n', op( B ) = B. - TRANSB = 'T' or 't', op( B ) = B'. - TRANSB = 'C' or 'c', op( B ) = conjg( B' ). - Unchanged on exit. - - M - (input) int - On entry, M specifies the number of rows of the matrix - op( A ) and of the matrix C. M must be at least zero. - Unchanged on exit. - - N - (input) int - On entry, N specifies the number of columns of the matrix - op( B ) and the number of columns of the matrix C. N must be - at least zero. - Unchanged on exit. - - K - (input) int - On entry, K specifies the number of columns of the matrix - op( A ) and the number of rows of the matrix op( B ). K must - be at least zero. - Unchanged on exit. - - ALPHA - (input) doublecomplex - On entry, ALPHA specifies the scalar alpha. - - A - (input) SuperMatrix* - Matrix A with a sparse format, of dimension (A->nrow, A->ncol). - Currently, the type of A can be: - Stype = NC or NCP; Dtype = SLU_Z; Mtype = GE. - In the future, more general A can be handled. - - B - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is - n when TRANSB = 'N' or 'n', and is k otherwise. - Before entry with TRANSB = 'N' or 'n', the leading k by n - part of the array B must contain the matrix B, otherwise - the leading n by k part of the array B must contain the - matrix B. - Unchanged on exit. - - LDB - (input) int - On entry, LDB specifies the first dimension of B as declared - in the calling (sub) program. LDB must be at least max( 1, n ). - Unchanged on exit. - - BETA - (input) doublecomplex - On entry, BETA specifies the scalar beta. When BETA is - supplied as zero then C need not be set on input. - - C - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ). - Before entry, the leading m by n part of the array C must - contain the matrix C, except when beta is zero, in which - case C need not be set on entry. - On exit, the array C is overwritten by the m by n matrix - ( alpha*op( A )*B + beta*C ). - - LDC - (input) int - On entry, LDC specifies the first dimension of C as declared - in the calling (sub)program. LDC must be at least max(1,m). - Unchanged on exit. - - ==== Sparse Level 3 Blas routine. -*/ - int incx = 1, incy = 1; - int j; - - for (j = 0; j < n; ++j) { - sp_zgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); - } - return 0; -} diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zutil.c --- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zutil.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zutil.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,485 +0,0 @@ - -/* - * -- SuperLU routine (version 3.0) -- - * Univ. of California Berkeley, Xerox Palo Alto Research Center, - * and Lawrence Berkeley National Lab. - * October 15, 2003 - * - */ -/* - Copyright (c) 1994 by Xerox Corporation. All rights reserved. - - THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY - EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. - - Permission is hereby granted to use or copy this program for any - purpose, provided the above notices are retained on all copies. - Permission to modify the code and to distribute modified code is - granted, provided the above notices are retained, and a notice that - the code was modified is included with the above copyright notice. -*/ - -#include -#include "slu_zdefs.h" - -void -zCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, - doublecomplex *nzval, int *rowind, int *colptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NCformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->rowind = rowind; - Astore->colptr = colptr; -} - -void -zCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, - doublecomplex *nzval, int *colind, int *rowptr, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - NRformat *Astore; - - A->Stype = stype; - A->Dtype = dtype; - A->Mtype = mtype; - A->nrow = m; - A->ncol = n; - A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); - if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store"); - Astore = A->Store; - Astore->nnz = nnz; - Astore->nzval = nzval; - Astore->colind = colind; - Astore->rowptr = rowptr; -} - -/* Copy matrix A into matrix B. */ -void -zCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore, *Bstore; - int ncol, nnz, i; - - B->Stype = A->Stype; - B->Dtype = A->Dtype; - B->Mtype = A->Mtype; - B->nrow = A->nrow;; - B->ncol = ncol = A->ncol; - Astore = (NCformat *) A->Store; - Bstore = (NCformat *) B->Store; - Bstore->nnz = nnz = Astore->nnz; - for (i = 0; i < nnz; ++i) - ((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i]; - for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; - for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; -} - - -void -zCreate_Dense_Matrix(SuperMatrix *X, int m, int n, doublecomplex *x, int ldx, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - DNformat *Xstore; - - X->Stype = stype; - X->Dtype = dtype; - X->Mtype = mtype; - X->nrow = m; - X->ncol = n; - X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); - if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store"); - Xstore = (DNformat *) X->Store; - Xstore->lda = ldx; - Xstore->nzval = (doublecomplex *) x; -} - -void -zCopy_Dense_Matrix(int M, int N, doublecomplex *X, int ldx, - doublecomplex *Y, int ldy) -{ -/* - * - * Purpose - * ======= - * - * Copies a two-dimensional matrix X to another matrix Y. - */ - int i, j; - - for (j = 0; j < N; ++j) - for (i = 0; i < M; ++i) - Y[i + j*ldy] = X[i + j*ldx]; -} - -void -zCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, - doublecomplex *nzval, int *nzval_colptr, int *rowind, - int *rowind_colptr, int *col_to_sup, int *sup_to_col, - Stype_t stype, Dtype_t dtype, Mtype_t mtype) -{ - SCformat *Lstore; - - L->Stype = stype; - L->Dtype = dtype; - L->Mtype = mtype; - L->nrow = m; - L->ncol = n; - L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); - if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); - Lstore = L->Store; - Lstore->nnz = nnz; - Lstore->nsuper = col_to_sup[n]; - Lstore->nzval = nzval; - Lstore->nzval_colptr = nzval_colptr; - Lstore->rowind = rowind; - Lstore->rowind_colptr = rowind_colptr; - Lstore->col_to_sup = col_to_sup; - Lstore->sup_to_col = sup_to_col; - -} - - -/* - * Convert a row compressed storage into a column compressed storage. - */ -void -zCompRow_to_CompCol(int m, int n, int nnz, - doublecomplex *a, int *colind, int *rowptr, - doublecomplex **at, int **rowind, int **colptr) -{ - register int i, j, col, relpos; - int *marker; - - /* Allocate storage for another copy of the matrix. */ - *at = (doublecomplex *) doublecomplexMalloc(nnz); - *rowind = (int *) intMalloc(nnz); - *colptr = (int *) intMalloc(n+1); - marker = (int *) intCalloc(n); - - /* Get counts of each column of A, and set up column pointers */ - for (i = 0; i < m; ++i) - for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; - (*colptr)[0] = 0; - for (j = 0; j < n; ++j) { - (*colptr)[j+1] = (*colptr)[j] + marker[j]; - marker[j] = (*colptr)[j]; - } - - /* Transfer the matrix into the compressed column storage. */ - for (i = 0; i < m; ++i) { - for (j = rowptr[i]; j < rowptr[i+1]; ++j) { - col = colind[j]; - relpos = marker[col]; - (*rowind)[relpos] = i; - (*at)[relpos] = a[j]; - ++marker[col]; - } - } - - SUPERLU_FREE(marker); -} - - -void -zPrint_CompCol_Matrix(char *what, SuperMatrix *A) -{ - NCformat *Astore; - register int i,n; - double *dp; - - printf("\nCompCol matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (NCformat *) A->Store; - dp = (double *) Astore->nzval; - printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); - printf("nzval: "); - for (i = 0; i < 2*Astore->colptr[n]; ++i) printf("%f ", dp[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); - printf("\ncolptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); - printf("\n"); - fflush(stdout); -} - -void -zPrint_SuperNode_Matrix(char *what, SuperMatrix *A) -{ - SCformat *Astore; - register int i, j, k, c, d, n, nsup; - double *dp; - int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; - - printf("\nSuperNode matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - n = A->ncol; - Astore = (SCformat *) A->Store; - dp = (double *) Astore->nzval; - col_to_sup = Astore->col_to_sup; - sup_to_col = Astore->sup_to_col; - rowind_colptr = Astore->rowind_colptr; - rowind = Astore->rowind; - printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", - A->nrow,A->ncol,Astore->nnz,Astore->nsuper); - printf("nzval:\n"); - for (k = 0; k <= Astore->nsuper; ++k) { - c = sup_to_col[k]; - nsup = sup_to_col[k+1] - c; - for (j = c; j < c + nsup; ++j) { - d = Astore->nzval_colptr[j]; - for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { - printf("%d\t%d\t%e\t%e\n", rowind[i], j, dp[d], dp[d+1]); - d += 2; - } - } - } -#if 0 - for (i = 0; i < 2*Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); -#endif - printf("\nnzval_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); - printf("\nrowind: "); - for (i = 0; i < Astore->rowind_colptr[n]; ++i) - printf("%d ", Astore->rowind[i]); - printf("\nrowind_colptr: "); - for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); - printf("\ncol_to_sup: "); - for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); - printf("\nsup_to_col: "); - for (i = 0; i <= Astore->nsuper+1; ++i) - printf("%d ", sup_to_col[i]); - printf("\n"); - fflush(stdout); -} - -void -zPrint_Dense_Matrix(char *what, SuperMatrix *A) -{ - DNformat *Astore; - register int i, j, lda = Astore->lda; - double *dp; - - printf("\nDense matrix %s:\n", what); - printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); - Astore = (DNformat *) A->Store; - dp = (double *) Astore->nzval; - printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); - printf("\nnzval: "); - for (j = 0; j < A->ncol; ++j) { - for (i = 0; i < 2*A->nrow; ++i) printf("%f ", dp[i + j*2*lda]); - printf("\n"); - } - printf("\n"); - fflush(stdout); -} - -/* - * Diagnostic print of column "jcol" in the U/L factor. - */ -void -zprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) -{ - int i, k, fsupc; - int *xsup, *supno; - int *xlsub, *lsub; - doublecomplex *lusup; - int *xlusup; - doublecomplex *ucol; - int *usub, *xusub; - - xsup = Glu->xsup; - supno = Glu->supno; - lsub = Glu->lsub; - xlsub = Glu->xlsub; - lusup = Glu->lusup; - xlusup = Glu->xlusup; - ucol = Glu->ucol; - usub = Glu->usub; - xusub = Glu->xusub; - - printf("%s", msg); - printf("col %d: pivrow %d, supno %d, xprune %d\n", - jcol, pivrow, supno[jcol], xprune[jcol]); - - printf("\tU-col:\n"); - for (i = xusub[jcol]; i < xusub[jcol+1]; i++) - printf("\t%d%10.4f, %10.4f\n", usub[i], ucol[i].r, ucol[i].i); - printf("\tL-col in rectangular snode:\n"); - fsupc = xsup[supno[jcol]]; /* first col of the snode */ - i = xlsub[fsupc]; - k = xlusup[jcol]; - while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { - printf("\t%d\t%10.4f, %10.4f\n", lsub[i], lusup[k].r, lusup[k].i); - i++; k++; - } - fflush(stdout); -} - - -/* - * Check whether tempv[] == 0. This should be true before and after - * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". - */ -void zcheck_tempv(int n, doublecomplex *tempv) -{ - int i; - - for (i = 0; i < n; i++) { - if ((tempv[i].r != 0.0) || (tempv[i].i != 0.0)) - { - fprintf(stderr,"tempv[%d] = {%f, %f}\n", i, tempv[i].r, tempv[i].i); - ABORT("zcheck_tempv"); - } - } -} - - -void -zGenXtrue(int n, int nrhs, doublecomplex *x, int ldx) -{ - int i, j; - for (j = 0; j < nrhs; ++j) - for (i = 0; i < n; ++i) { - x[i + j*ldx].r = 1.0; - x[i + j*ldx].i = 0.0; - } -} - -/* - * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's - */ -void -zFillRHS(trans_t trans, int nrhs, doublecomplex *x, int ldx, - SuperMatrix *A, SuperMatrix *B) -{ - NCformat *Astore; - doublecomplex *Aval; - DNformat *Bstore; - doublecomplex *rhs; - doublecomplex one = {1.0, 0.0}; - doublecomplex zero = {0.0, 0.0}; - int ldc; - char transc[1]; - - Astore = A->Store; - Aval = (doublecomplex *) Astore->nzval; - Bstore = B->Store; - rhs = Bstore->nzval; - ldc = Bstore->lda; - - if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; - else *(unsigned char *)transc = 'T'; - - sp_zgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, - x, ldx, zero, rhs, ldc); - -} - -/* - * Fills a doublecomplex precision array with a given value. - */ -void -zfill(doublecomplex *a, int alen, doublecomplex dval) -{ - register int i; - for (i = 0; i < alen; i++) a[i] = dval; -} - - - -/* - * Check the inf-norm of the error vector - */ -void zinf_norm_error(int nrhs, SuperMatrix *X, doublecomplex *xtrue) -{ - DNformat *Xstore; - double err, xnorm; - doublecomplex *Xmat, *soln_work; - doublecomplex temp; - int i, j; - - Xstore = X->Store; - Xmat = Xstore->nzval; - - for (j = 0; j < nrhs; j++) { - soln_work = &Xmat[j*Xstore->lda]; - err = xnorm = 0.0; - for (i = 0; i < X->nrow; i++) { - z_sub(&temp, &soln_work[i], &xtrue[i]); - err = SUPERLU_MAX(err, z_abs(&temp)); - xnorm = SUPERLU_MAX(xnorm, z_abs(&soln_work[i])); - } - err = err / xnorm; - printf("||X - Xtrue||/||X|| = %e\n", err); - } -} - - - -/* Print performance of the code. */ -void -zPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, - double rpg, double rcond, double *ferr, - double *berr, char *equed, SuperLUStat_t *stat) -{ - SCformat *Lstore; - NCformat *Ustore; - double *utime; - flops_t *ops; - - utime = stat->utime; - ops = stat->ops; - - if ( utime[FACT] != 0. ) - printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], - ops[FACT]*1e-6/utime[FACT]); - printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); - if ( utime[SOLVE] != 0. ) - printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], - ops[SOLVE]*1e-6/utime[SOLVE]); - - Lstore = (SCformat *) L->Store; - Ustore = (NCformat *) U->Store; - printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); - printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); - printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); - - printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", - mem_usage->for_lu/1e6, mem_usage->total_needed/1e6, - mem_usage->expansions); - - printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); - printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", - utime[FACT], ops[FACT]*1e-6/utime[FACT], - utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], - utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); - - printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); - printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", - rpg, rcond, ferr[0], berr[0], equed); - -} - - - - -print_doublecomplex_vec(char *what, int n, doublecomplex *vec) -{ - int i; - printf("%s: n %d\n", what, n); - for (i = 0; i < n; ++i) printf("%d\t%f%f\n", i, vec[i].r, vec[i].i); - return 0; -} - diff -Nru hypre-2.11.2/src/IJ_mv/IJVector_parcsr.c hypre-2.13.0/src/IJ_mv/IJVector_parcsr.c --- hypre-2.11.2/src/IJ_mv/IJVector_parcsr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/IJ_mv/IJVector_parcsr.c 2017-10-20 17:42:22.000000000 +0000 @@ -858,7 +858,7 @@ HYPRE_Complex *off_proc_data) { MPI_Comm comm = hypre_IJVectorComm(vector); - hypre_ParVector *par_vector = hypre_IJVectorObject(vector); + hypre_ParVector *par_vector = ( hypre_ParVector *) hypre_IJVectorObject(vector); hypre_MPI_Request *requests = NULL; hypre_MPI_Status *status = NULL; HYPRE_Int i, j, j2, row; @@ -1315,7 +1315,8 @@ ex_contact_vec_starts[i+1] = -storage-1; /* need negative for next loop */ } - void_contact_buf = hypre_MAlloc(storage*obj_size_bytes); + /*void_contact_buf = hypre_MAlloc(storage*obj_size_bytes);*/ + void_contact_buf = hypre_CAlloc(storage,obj_size_bytes); index_ptr = void_contact_buf; /* step through with this index */ /* set up data to be sent to send procs */ diff -Nru hypre-2.11.2/src/krylov/bicgstab.c hypre-2.13.0/src/krylov/bicgstab.c --- hypre-2.11.2/src/krylov/bicgstab.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/bicgstab.c 2017-10-20 17:42:22.000000000 +0000 @@ -19,7 +19,6 @@ #include "krylov.h" #include "_hypre_utilities.h" - /*-------------------------------------------------------------------------- * hypre_BiCGSTABFunctionsCreate *--------------------------------------------------------------------------*/ @@ -236,7 +235,7 @@ HYPRE_Int iter; HYPRE_Int my_id, num_procs; HYPRE_Real alpha, beta, gamma, epsilon, temp, res, r_norm, b_norm; - HYPRE_Real epsmac = 1.e-128; + HYPRE_Real epsmac = HYPRE_REAL_MIN; HYPRE_Real ieee_check = 0.; HYPRE_Real cf_ave_0 = 0.0; HYPRE_Real cf_ave_1 = 0.0; diff -Nru hypre-2.11.2/src/krylov/gmres.c hypre-2.13.0/src/krylov/gmres.c --- hypre-2.11.2/src/krylov/gmres.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/gmres.c 2017-10-20 17:42:22.000000000 +0000 @@ -833,7 +833,6 @@ (gmres_data -> rel_residual_norm) = r_norm; if (iter >= max_iter && r_norm > epsilon) hypre_error(HYPRE_ERROR_CONV); - hypre_TFreeF(c,gmres_functions); hypre_TFreeF(s,gmres_functions); diff -Nru hypre-2.11.2/src/krylov/HYPRE_lobpcg.c hypre-2.13.0/src/krylov/HYPRE_lobpcg.c --- hypre-2.11.2/src/krylov/HYPRE_lobpcg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/HYPRE_lobpcg.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - /****************************************************************************** * * HYPRE_LOBPCG interface @@ -20,29 +19,6 @@ #include "_hypre_utilities.h" #include "HYPRE_config.h" -#ifdef HYPRE_USING_ESSL - -#include - -#else - -#include "fortran.h" -#ifdef __cplusplus -extern "C" { -#endif - -HYPRE_Int hypre_F90_NAME_LAPACK(dsygv, DSYGV) - ( HYPRE_Int *itype, char *jobz, char *uplo, HYPRE_Int *n, - HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb, HYPRE_Real *w, - HYPRE_Real *work, HYPRE_Int *lwork, /*@out@*/ HYPRE_Int *info - ); -HYPRE_Int hypre_F90_NAME_LAPACK( dpotrf, DPOTRF ) - (const char* uplo, HYPRE_Int* n, HYPRE_Real* aval, HYPRE_Int* lda, HYPRE_Int* ierr ); -#ifdef __cplusplus -} -#endif - -#endif #include "HYPRE_lobpcg.h" #include "lobpcg.h" @@ -50,6 +26,8 @@ #include "interpreter.h" #include "HYPRE_MatvecFunctions.h" +#include "_hypre_lapack.h" + typedef struct { HYPRE_Int (*Precond)(void*,void*,void*,void*); @@ -107,24 +85,14 @@ n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb, HYPRE_Real *w, HYPRE_Real *work, HYPRE_Int *lwork, HYPRE_Int *info) { -#ifdef HYPRE_USING_ESSL - dsygv(*itype, a, *lda, b, *ldb, w, a, *lda, *n, work, *lwork ); -#else - hypre_F90_NAME_LAPACK( dsygv, DSYGV )( itype, jobz, uplo, n, - a, lda, b, ldb, - w, work, lwork, info ); -#endif + hypre_dsygv(itype, jobz, uplo, n, a, lda, b, ldb, w, work, lwork, info); return 0; } -static HYPRE_Int dpotrf_interface (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int * +static HYPRE_Int dpotrf_interface (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int * lda, HYPRE_Int *info) { -#ifdef HYPRE_USING_ESSL - dpotrf(uplo, *n, a, *lda, info); -#else - hypre_F90_NAME_LAPACK( dpotrf, DPOTRF )(uplo, n, a, lda, info); -#endif + hypre_dpotrf(uplo, n, a, lda, info); return 0; } diff -Nru hypre-2.11.2/src/krylov/lobpcg.c hypre-2.13.0/src/krylov/lobpcg.c --- hypre-2.11.2/src/krylov/lobpcg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/lobpcg.c 2017-10-20 17:42:22.000000000 +0000 @@ -29,7 +29,7 @@ static HYPRE_Int lobpcg_chol( utilities_FortranMatrix* a, - HYPRE_Int (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) ) + HYPRE_Int (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) ) { HYPRE_Int lda, n; @@ -121,7 +121,7 @@ mv_MultiVectorPtr x, mv_MultiVectorPtr y, utilities_FortranMatrix* r, mv_MultiVectorPtr z, -HYPRE_Int (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) +HYPRE_Int (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) ){ @@ -176,7 +176,7 @@ for ( i = 0; i < n; i++ ) { if ( utilities_FortranMatrixValue( resNorms, i + 1, 1 ) > utilities_FortranMatrixValue( lambda, i + 1, 1 )*rtol + atol - + DBL_EPSILON ) { + + HYPRE_REAL_EPSILON ) { activeMask[i] = 1; notConverged++; } diff -Nru hypre-2.11.2/src/krylov/lobpcg.h hypre-2.13.0/src/krylov/lobpcg.h --- hypre-2.11.2/src/krylov/lobpcg.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/lobpcg.h 2017-10-20 17:42:22.000000000 +0000 @@ -15,6 +15,10 @@ #ifndef LOCALLY_OPTIMAL_BLOCK_PRECONDITIONED_CONJUGATE_GRADIENTS #define LOCALLY_OPTIMAL_BLOCK_PRECONDITIONED_CONJUGATE_GRADIENTS +#ifdef __cplusplus +extern "C" { +#endif + #define PROBLEM_SIZE_TOO_SMALL 1 #define WRONG_BLOCK_SIZE 2 #define WRONG_CONSTRAINTS 3 @@ -30,7 +34,7 @@ typedef struct { /* these pointers should point to 2 functions providing standard lapack functionality */ - HYPRE_Int (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int * + HYPRE_Int (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int * lda, HYPRE_Int *info); HYPRE_Int (*dsygv) (HYPRE_Int *itype, char *jobz, char *uplo, HYPRE_Int * n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb, @@ -38,10 +42,6 @@ } lobpcg_BLASLAPACKFunctions; -#ifdef __cplusplus -extern "C" { -#endif - HYPRE_Int lobpcg_solve( mv_MultiVectorPtr blockVectorX, void* operatorAData, diff -Nru hypre-2.11.2/src/krylov/Makefile hypre-2.13.0/src/krylov/Makefile --- hypre-2.11.2/src/krylov/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -18,6 +18,8 @@ C_COMPILE_FLAGS = \ -I..\ -I$(srcdir)/..\ + -I$(srcdir)/../blas\ + -I$(srcdir)/../lapack\ -I$(srcdir)/../multivector\ -I$(srcdir)/../utilities\ ${CINCLUDES} diff -Nru hypre-2.11.2/src/krylov/pcg.c hypre-2.13.0/src/krylov/pcg.c --- hypre-2.11.2/src/krylov/pcg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/krylov/pcg.c 2017-10-20 17:42:22.000000000 +0000 @@ -645,7 +645,7 @@ break; } - if ( (gamma<1.0e-292) && ((-gamma)<1.0e-292) ) { + if ( (gamma 0.0) { cf_ave_0 = cf_ave_1; - if ( i_prod_0<1.0e-292 ) { + if ( i_prod_0 2e3) { @@ -55,3 +58,6 @@ } /* dlabad_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlabrd.c hypre-2.13.0/src/lapack/dlabrd.c --- hypre-2.11.2/src/lapack/dlabrd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlabrd.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlabrd_(integer *m, integer *n, integer *nb, doublereal * +/* Subroutine */ integer dlabrd_(integer *m, integer *n, integer *nb, doublereal * a, integer *lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *taup, doublereal *x, integer *ldx, doublereal *y, integer *ldy) @@ -149,7 +152,7 @@ i__3; /* Local variables */ static integer i__; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlarfg_(integer *, doublereal *, @@ -410,4 +413,6 @@ #undef x_ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlacpy.c hypre-2.13.0/src/lapack/dlacpy.c --- hypre-2.11.2/src/lapack/dlacpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlacpy.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlacpy_(const char *uplo, integer *m, integer *n, doublereal * +/* Subroutine */ integer dlacpy_(const char *uplo, integer *m, integer *n, doublereal * a, integer *lda, doublereal *b, integer *ldb) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -106,4 +109,6 @@ #undef b_ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlae2.c hypre-2.13.0/src/lapack/dlae2.c --- hypre-2.11.2/src/lapack/dlae2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlae2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlae2_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ integer dlae2_(doublereal *a, doublereal *b, doublereal *c__, doublereal *rt1, doublereal *rt2) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -58,8 +61,6 @@ Compute the eigenvalues */ /* System generated locals */ doublereal d__1; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static doublereal acmn, acmx, ab, df, tb, sm, rt, adf; @@ -119,3 +120,6 @@ } /* dlae2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlaev2.c hypre-2.13.0/src/lapack/dlaev2.c --- hypre-2.11.2/src/lapack/dlaev2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlaev2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlaev2_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ integer dlaev2_(doublereal *a, doublereal *b, doublereal *c__, doublereal *rt1, doublereal *rt2, doublereal *cs1, doublereal *sn1) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -69,8 +72,6 @@ Compute the eigenvalues */ /* System generated locals */ doublereal d__1; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static doublereal acmn, acmx, ab, df, cs, ct, tb, sm, tn, rt, adf, acs; static integer sgn1, sgn2; @@ -164,3 +165,6 @@ } /* dlaev2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlamch.c hypre-2.13.0/src/lapack/dlamch.c --- hypre-2.11.2/src/lapack/dlamch.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlamch.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" doublereal dlamch_(const char *cmach) { @@ -14,7 +16,7 @@ Purpose ======= - DLAMCH determines HYPRE_Real precision machine parameters. + DLAMCH determines doublereal precision machine parameters. Arguments ========= @@ -54,7 +56,7 @@ integer i__1; doublereal ret_val; /* Builtin functions */ - HYPRE_Real pow_di(doublereal *, integer *); + doublereal pow_di(doublereal *, integer *); /* Local variables */ static doublereal base; static integer beta; @@ -64,7 +66,7 @@ static doublereal rmin, rmax, t, rmach; extern logical lsame_(const char *,const char *); static doublereal small, sfmin; - extern /* Subroutine */ HYPRE_Int dlamc2_(integer *, integer *, logical *, + extern /* Subroutine */ integer dlamc2_(integer *, integer *, logical *, doublereal *, integer *, doublereal *, integer *, doublereal *); static integer it; static doublereal rnd, eps; @@ -129,9 +131,7 @@ } /* dlamch_ */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlamc1_(integer *beta, integer *t, logical *rnd, logical +/* Subroutine */ integer dlamc1_(integer *beta, integer *t, logical *rnd, logical *ieee1) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -252,7 +252,7 @@ /* + END WHILE Now compute the base. a and c are neighbouring floating po -HYPRE_Int +integer numbers in the interval ( beta**t, beta**( t + 1 ) ) and so their difference is beta. Adding 0.25 to c is to ensure that @@ -342,9 +342,7 @@ } /* dlamc1_ */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlamc2_(integer *beta, integer *t, logical *rnd, +/* Subroutine */ integer dlamc2_(integer *beta, integer *t, logical *rnd, doublereal *eps, integer *emin, doublereal *rmin, integer *emax, doublereal *rmax) { @@ -420,7 +418,7 @@ integer i__1; doublereal d__1, d__2, d__3, d__4, d__5; /* Builtin functions */ - HYPRE_Real pow_di(doublereal *, integer *); + doublereal pow_di(doublereal *, integer *); /* Local variables */ static logical ieee; static doublereal half; @@ -432,11 +430,11 @@ static doublereal small; static integer gpmin; static doublereal third, lrmin, lrmax, sixth; - extern /* Subroutine */ HYPRE_Int dlamc1_(integer *, integer *, logical *, + extern /* Subroutine */ integer dlamc1_(integer *, integer *, logical *, logical *); extern doublereal dlamc3_(doublereal *, doublereal *); static logical lieee1; - extern /* Subroutine */ HYPRE_Int dlamc4_(integer *, doublereal *, integer *), + extern /* Subroutine */ integer dlamc4_(integer *, doublereal *, integer *), dlamc5_(integer *, integer *, integer *, logical *, integer *, doublereal *); static integer lt, ngnmin, ngpmin; @@ -594,7 +592,7 @@ if (iwarn) { first = TRUE_; hypre_printf("\n\n WARNING. The value EMIN may be incorrect:- "); - hypre_printf("EMIN = %8i\n",(HYPRE_Int)lemin); + hypre_printf("EMIN = %8i\n",(integer)lemin); hypre_printf("If, after inspection, the value EMIN looks acceptable"); hypre_printf("please comment out \n the IF block as marked within the"); hypre_printf("code of routine DLAMC2, \n otherwise supply EMIN"); @@ -647,8 +645,6 @@ } /* dlamc2_ */ -#include "f2c.h" - doublereal dlamc3_(doublereal *a, doublereal *b) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -688,9 +684,7 @@ } /* dlamc3_ */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlamc4_(integer *emin, doublereal *start, integer *base) +/* Subroutine */ integer dlamc4_(integer *emin, doublereal *start, integer *base) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., @@ -779,9 +773,7 @@ } /* dlamc4_ */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlamc5_(integer *beta, integer *p, integer *emin, +/* Subroutine */ integer dlamc5_(integer *beta, integer *p, integer *emin, logical *ieee, integer *emax, doublereal *rmax) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -954,3 +946,6 @@ } /* dlamc5_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlange.c hypre-2.13.0/src/lapack/dlange.c --- hypre-2.11.2/src/lapack/dlange.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlange.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,6 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" doublereal dlange_(const char *norm, integer *m, integer *n, doublereal *a, integer *lda, doublereal *work) @@ -72,13 +75,13 @@ integer a_dim1, a_offset, i__1, i__2; doublereal ret_val, d__1, d__2, d__3; /* Builtin functions */ - /*HYPRE_Real sqrt(doublereal);*/ + /*doublereal sqrt(doublereal);*/ /* Local variables */ static integer i__, j; static doublereal scale; extern logical lsame_(const char *,const char *); static doublereal value; - extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); static doublereal sum; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -174,4 +177,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlanst.c hypre-2.13.0/src/lapack/dlanst.c --- hypre-2.11.2/src/lapack/dlanst.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlanst.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,6 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" doublereal dlanst_(const char *norm, integer *n, doublereal *d__, doublereal *e) { @@ -62,14 +65,12 @@ /* System generated locals */ integer i__1; doublereal ret_val, d__1, d__2, d__3, d__4, d__5; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static integer i__; static doublereal scale; extern logical lsame_(const char *,const char *); static doublereal anorm; - extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); static doublereal sum; @@ -138,3 +139,6 @@ } /* dlanst_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlansy.c hypre-2.13.0/src/lapack/dlansy.c --- hypre-2.11.2/src/lapack/dlansy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlansy.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" -#include "math.h" +#include "hypre_lapack.h" doublereal dlansy_(const char *norm,const char *uplo, integer *n, doublereal *a, integer *lda, doublereal *work) @@ -81,14 +83,14 @@ integer a_dim1, a_offset, i__1, i__2; doublereal ret_val, d__1, d__2, d__3; /* Builtin functions */ - /*HYPRE_Real sqrt(doublereal);*/ + /*doublereal sqrt(doublereal);*/ /* Local variables */ static doublereal absa; static integer i__, j; static doublereal scale; extern logical lsame_(const char *,const char *); static doublereal value; - extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); static doublereal sum; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -214,4 +216,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlapy2.c hypre-2.13.0/src/lapack/dlapy2.c --- hypre-2.11.2/src/lapack/dlapy2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlapy2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" -#include "math.h" +#include "hypre_lapack.h" doublereal dlapy2_(doublereal *x, doublereal *y) { @@ -28,7 +30,7 @@ /* System generated locals */ doublereal ret_val, d__1; /* Builtin functions */ - /*HYPRE_Real sqrt(doublereal);*/ + /*doublereal sqrt(doublereal);*/ /* Local variables */ static doublereal xabs, yabs, w, z__; @@ -51,3 +53,6 @@ } /* dlapy2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlarfb.c hypre-2.13.0/src/lapack/dlarfb.c --- hypre-2.11.2/src/lapack/dlarfb.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlarfb.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,9 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlarfb_(const char *side,const char *trans,const char *direct,const char * +/* Subroutine */ integer dlarfb_(const char *side,const char *trans,const char *direct,const char * storev, integer *m, integer *n, integer *k, doublereal *v, integer * ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *ldwork) @@ -102,11 +104,11 @@ work_offset, i__1, i__2; /* Local variables */ static integer i__, j; - extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrmm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -704,4 +706,6 @@ #undef c___ref #undef work_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlarf.c hypre-2.13.0/src/lapack/dlarf.c --- hypre-2.11.2/src/lapack/dlarf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlarf.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlarf_(const char *side, integer *m, integer *n, doublereal *v, +/* Subroutine */ integer dlarf_(const char *side, integer *m, integer *n, doublereal *v, integer *incv, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work) { @@ -74,11 +77,11 @@ integer c_dim1, c_offset; doublereal d__1; /* Local variables */ - extern /* Subroutine */ HYPRE_Int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ integer dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, + extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -131,3 +134,6 @@ } /* dlarf_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlarfg.c hypre-2.13.0/src/lapack/dlarfg.c --- hypre-2.11.2/src/lapack/dlarfg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlarfg.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlarfg_(integer *n, doublereal *alpha, doublereal *x, +/* Subroutine */ integer dlarfg_(integer *n, doublereal *alpha, doublereal *x, integer *incx, doublereal *tau) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -63,12 +66,12 @@ integer i__1; doublereal d__1; /* Builtin functions */ - HYPRE_Real d_sign(doublereal *, doublereal *); + doublereal d_sign(doublereal *, doublereal *); /* Local variables */ static doublereal beta; extern doublereal dnrm2_(integer *, doublereal *, integer *); static integer j; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *); static doublereal xnorm; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(const char *); @@ -148,3 +151,6 @@ } /* dlarfg_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlarft.c hypre-2.13.0/src/lapack/dlarft.c --- hypre-2.11.2/src/lapack/dlarft.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlarft.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,9 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlarft_(const char *direct,const char *storev, integer *n, integer * +/* Subroutine */ integer dlarft_(const char *direct,const char *storev, integer *n, integer * k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t, integer *ldt) { @@ -117,7 +119,7 @@ /* Local variables */ static integer i__, j; extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, + extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dtrmv_(const char *, const char *,const char *, integer *, doublereal *, integer *, doublereal *, @@ -257,4 +259,6 @@ #undef v_ref #undef t_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlartg.c hypre-2.13.0/src/lapack/dlartg.c --- hypre-2.11.2/src/lapack/dlartg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlartg.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" -#include "math.h" -/* Subroutine */ HYPRE_Int dlartg_(doublereal *f, doublereal *g, doublereal *cs, +#include "hypre_lapack.h" + +/* Subroutine */ integer dlartg_(doublereal *f, doublereal *g, doublereal *cs, doublereal *sn, doublereal *r__) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -54,24 +57,26 @@ integer i__1; doublereal d__1, d__2; /* Builtin functions */ - //HYPRE_Real log(doublereal), pow_di(doublereal *, integer *), sqrt(doublereal); +// doublereal log(doublereal), pow_di(doublereal *, integer *), sqrt(doublereal); + doublereal pow_di(doublereal *, integer *); /* Local variables */ static integer i__; static doublereal scale; static integer count; static doublereal f1, g1, safmn2, safmx2; extern doublereal dlamch_(const char *); - static doublereal safmin, eps; +// static doublereal safmin, eps; if (first) { first = FALSE_; - safmin = dlamch_("S"); - eps = dlamch_("E"); +// safmin = dlamch_("S"); +// eps = dlamch_("E"); d__1 = dlamch_("B"); - i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) / - 2.); +// i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) / +// 2.); + i__1 = HYPRE_REAL_MIN_EXP>>1; safmn2 = pow_di(&d__1, &i__1); safmx2 = 1. / safmn2; } @@ -158,3 +163,6 @@ } /* dlartg_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlas2.c hypre-2.13.0/src/lapack/dlas2.c --- hypre-2.11.2/src/lapack/dlas2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlas2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlas2_(doublereal *f, doublereal *g, doublereal *h__, +/* Subroutine */ integer dlas2_(doublereal *f, doublereal *g, doublereal *h__, doublereal *ssmin, doublereal *ssmax) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -60,8 +63,6 @@ ==================================================================== */ /* System generated locals */ doublereal d__1, d__2; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static doublereal fhmn, fhmx, c__, fa, ga, ha, as, at, au; @@ -121,3 +122,6 @@ } /* dlas2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlascl.c hypre-2.13.0/src/lapack/dlascl.c --- hypre-2.11.2/src/lapack/dlascl.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlascl.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlascl_(const char *type__, integer *kl, integer *ku, +/* Subroutine */ integer dlascl_(const char *type__, integer *kl, integer *ku, doublereal *cfrom, doublereal *cto, integer *m, integer *n, doublereal *a, integer *lda, integer *info) { @@ -88,7 +91,7 @@ static doublereal cfrom1; extern doublereal dlamch_(const char *); static doublereal cfromc; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static doublereal bignum, smlnum, mul, cto1; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -313,4 +316,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlaset.c hypre-2.13.0/src/lapack/dlaset.c --- hypre-2.11.2/src/lapack/dlaset.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlaset.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlaset_(const char *uplo, integer *m, integer *n, doublereal * +/* Subroutine */ integer dlaset_(const char *uplo, integer *m, integer *n, doublereal * alpha, doublereal *beta, doublereal *a, integer *lda) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -131,4 +134,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq1.c hypre-2.13.0/src/lapack/dlasq1.c --- hypre-2.11.2/src/lapack/dlasq1.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq1.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,6 +1,9 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" /* -- translated by f2c (version 19990503). You must link the resulting object file with the libraries: @@ -13,33 +16,30 @@ static integer c__2 = 2; static integer c__0 = 0; -/* Subroutine */ HYPRE_Int dlasq1_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ integer dlasq1_(integer *n, doublereal *d__, doublereal *e, doublereal *work, integer *info) { /* System generated locals */ integer i__1, i__2; doublereal d__1, d__2, d__3; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); - /* Local variables */ - extern /* Subroutine */ HYPRE_Int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ integer dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); static integer i__; static doublereal scale; static integer iinfo; static doublereal sigmn; - extern /* Subroutine */ HYPRE_Int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); static doublereal sigmx; - extern /* Subroutine */ HYPRE_Int dlasq2_(integer *, doublereal *, integer *); + extern /* Subroutine */ integer dlasq2_(integer *, doublereal *, integer *); extern doublereal dlamch_(const char *); - extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); static doublereal safmin; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *), dlasrt_( + extern /* Subroutine */ integer xerbla_(const char *, integer *), dlasrt_( const char *, integer *, doublereal *, integer *); static doublereal eps; @@ -192,3 +192,6 @@ } /* dlasq1_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq2.c hypre-2.13.0/src/lapack/dlasq2.c --- hypre-2.11.2/src/lapack/dlasq2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,12 +1,15 @@ +#ifdef __cplusplus +extern "C" { +#endif +#include "f2c.h" #include "hypre_lapack.h" + /* -- translated by f2c (version 19990503). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ -#include "f2c.h" - /* Table of constant values */ static integer c__1 = 1; @@ -16,15 +19,12 @@ static integer c__4 = 4; static integer c__11 = 11; -/* Subroutine */ HYPRE_Int dlasq2_(integer *n, doublereal *z__, integer *info) +/* Subroutine */ integer dlasq2_(integer *n, doublereal *z__, integer *info) { /* System generated locals */ integer i__1, i__2, i__3; doublereal d__1, d__2; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); - /* Local variables */ static logical ieee; static integer nbig; @@ -38,16 +38,16 @@ static integer nfail; static doublereal desig, trace, sigma; static integer iinfo, i0, i4, n0; - extern /* Subroutine */ HYPRE_Int dlasq3_(integer *, integer *, doublereal *, + extern /* Subroutine */ integer dlasq3_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, logical *); extern doublereal dlamch_(const char *); static integer pp, iwhila, iwhilb; static doublereal oldemn, safmin; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, integer *); static doublereal eps, tol; static integer ipn4; @@ -522,3 +522,6 @@ } /* dlasq2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq3.c hypre-2.13.0/src/lapack/dlasq3.c --- hypre-2.11.2/src/lapack/dlasq3.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq3.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlasq3_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ integer dlasq3_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *dmin__, doublereal *sigma, doublereal *desig, doublereal *qmax, integer *nfail, integer *iter, integer *ndiv, logical *ieee) @@ -76,12 +79,10 @@ /* System generated locals */ integer i__1; doublereal d__1, d__2; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static doublereal temp, s, t; static integer j4; - extern /* Subroutine */ HYPRE_Int dlasq4_(integer *, integer *, doublereal *, + extern /* Subroutine */ integer dlasq4_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *) , dlasq5_(integer *, integer *, doublereal *, integer *, @@ -316,3 +317,6 @@ } /* dlasq3_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq4.c hypre-2.13.0/src/lapack/dlasq4.c --- hypre-2.11.2/src/lapack/dlasq4.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq4.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,13 +1,16 @@ +#ifdef __cplusplus +extern "C" { +#endif +#include "f2c.h" #include "hypre_lapack.h" + /* -- translated by f2c (version 19990503). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlasq4_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ integer dlasq4_(integer *i0, integer *n0, doublereal *z__, integer *pp, integer *n0in, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal *dn1, doublereal *dn2, doublereal *tau, integer *ttype) @@ -20,9 +23,6 @@ integer i__1; doublereal d__1, d__2; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); - /* Local variables */ static doublereal s, a2, b1, b2; static integer i4, nn, np; @@ -380,3 +380,6 @@ } /* dlasq4_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq5.c hypre-2.13.0/src/lapack/dlasq5.c --- hypre-2.11.2/src/lapack/dlasq5.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq5.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlasq5_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ integer dlasq5_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *tau, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal *dnm1, doublereal *dnm2, logical *ieee) @@ -209,3 +212,6 @@ } /* dlasq5_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasq6.c hypre-2.13.0/src/lapack/dlasq6.c --- hypre-2.11.2/src/lapack/dlasq6.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasq6.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,13 +1,16 @@ +#ifdef __cplusplus +extern "C" { +#endif +#include "f2c.h" #include "hypre_lapack.h" + /* -- translated by f2c (version 19990503). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ -#include "f2c.h" - -/* Subroutine */ HYPRE_Int dlasq6_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ integer dlasq6_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal *dnm1, doublereal *dnm2) { @@ -187,3 +190,6 @@ } /* dlasq6_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasr.c hypre-2.13.0/src/lapack/dlasr.c --- hypre-2.11.2/src/lapack/dlasr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlasr_(const char *side,const char *pivot,const char *direct, integer *m, +/* Subroutine */ integer dlasr_(const char *side,const char *pivot,const char *direct, integer *m, integer *n, doublereal *c__, doublereal *s, doublereal *a, integer * lda) { @@ -114,7 +117,7 @@ static integer i__, j; extern logical lsame_(const char *,const char *); static doublereal ctemp, stemp; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] --c__; @@ -392,4 +395,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasrt.c hypre-2.13.0/src/lapack/dlasrt.c --- hypre-2.11.2/src/lapack/dlasrt.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasrt.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlasrt_(const char *id, integer *n, doublereal *d__, integer * +/* Subroutine */ integer dlasrt_(const char *id, integer *n, doublereal *d__, integer * info) { /* -- LAPACK routine (version 3.0) -- @@ -54,7 +57,7 @@ static integer stack[64] /* was [2][32] */; static doublereal dmnmx, d1, d2, d3; static integer start; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static integer stkpnt, dir; static doublereal tmp; #define stack_ref(a_1,a_2) stack[(a_2)*2 + a_1 - 3] @@ -258,4 +261,6 @@ #undef stack_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlassq.c hypre-2.13.0/src/lapack/dlassq.c --- hypre-2.11.2/src/lapack/dlassq.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlassq.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlassq_(integer *n, doublereal *x, integer *incx, +/* Subroutine */ integer dlassq_(integer *n, doublereal *x, integer *incx, doublereal *scale, doublereal *sumsq) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -92,3 +95,6 @@ } /* dlassq_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlasv2.c hypre-2.13.0/src/lapack/dlasv2.c --- hypre-2.11.2/src/lapack/dlasv2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlasv2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlasv2_(doublereal *f, doublereal *g, doublereal *h__, +/* Subroutine */ integer dlasv2_(doublereal *f, doublereal *g, doublereal *h__, doublereal *ssmin, doublereal *ssmax, doublereal *snr, doublereal * csr, doublereal *snl, doublereal *csl) { @@ -83,7 +86,7 @@ /* System generated locals */ doublereal d__1; /* Builtin functions */ - HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *); + doublereal d_sign(doublereal *, doublereal *); /* Local variables */ static integer pmax; static doublereal temp; @@ -249,3 +252,6 @@ } /* dlasv2_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlaswp.c hypre-2.13.0/src/lapack/dlaswp.c --- hypre-2.11.2/src/lapack/dlaswp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlaswp.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlaswp_(integer *n, doublereal *a, integer *lda, integer +/* Subroutine */ integer dlaswp_(integer *n, doublereal *a, integer *lda, integer *k1, integer *k2, integer *ipiv, integer *incx) { /* -- LAPACK auxiliary routine (version 3.0) -- @@ -141,4 +144,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dlatrd.c hypre-2.13.0/src/lapack/dlatrd.c --- hypre-2.11.2/src/lapack/dlatrd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dlatrd.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dlatrd_(const char *uplo, integer *n, integer *nb, doublereal * +/* Subroutine */ integer dlatrd_(const char *uplo, integer *n, integer *nb, doublereal * a, integer *lda, doublereal *e, doublereal *tau, doublereal *w, integer *ldw) { @@ -152,10 +155,10 @@ integer *); static integer i__; static doublereal alpha; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, + extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), @@ -330,4 +333,6 @@ #undef w_ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorg2l.c hypre-2.13.0/src/lapack/dorg2l.c --- hypre-2.11.2/src/lapack/dorg2l.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorg2l.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorg2l_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorg2l_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -69,11 +72,11 @@ doublereal d__1; /* Local variables */ static integer i__, j, l; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); static integer ii; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -152,4 +155,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorg2r.c hypre-2.13.0/src/lapack/dorg2r.c --- hypre-2.11.2/src/lapack/dorg2r.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorg2r.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorg2r_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorg2r_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -69,7 +72,7 @@ doublereal d__1; /* Local variables */ static integer i__, j, l; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -152,4 +155,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorgbr.c hypre-2.13.0/src/lapack/dorgbr.c --- hypre-2.11.2/src/lapack/dorgbr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorgbr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorgbr_(const char *vect, integer *m, integer *n, integer *k, +/* Subroutine */ integer dorgbr_(const char *vect, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -110,10 +113,10 @@ static integer iinfo; static logical wantq; static integer nb, mn; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int dorglq_(integer *, integer *, integer *, + extern /* Subroutine */ integer dorglq_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -277,4 +280,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorgl2.c hypre-2.13.0/src/lapack/dorgl2.c --- hypre-2.11.2/src/lapack/dorgl2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorgl2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorgl2_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorgl2_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -65,7 +68,7 @@ doublereal d__1; /* Local variables */ static integer i__, j, l; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(const char *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -152,4 +155,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorglq.c hypre-2.13.0/src/lapack/dorglq.c --- hypre-2.11.2/src/lapack/dorglq.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorglq.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorglq_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorglq_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -82,15 +85,15 @@ integer a_dim1, a_offset, i__1, i__2, i__3; /* Local variables */ static integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ HYPRE_Int dorgl2_(integer *, integer *, integer *, + extern /* Subroutine */ integer dorgl2_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static integer ib, nb, ki, kk; - extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nx; - extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -261,4 +264,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorgql.c hypre-2.13.0/src/lapack/dorgql.c --- hypre-2.11.2/src/lapack/dorgql.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorgql.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorgql_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorgql_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -83,15 +86,15 @@ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ static integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ HYPRE_Int dorg2l_(integer *, integer *, integer *, + extern /* Subroutine */ integer dorg2l_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static integer ib, nb, kk; - extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nx; - extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -262,4 +265,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorgqr.c hypre-2.13.0/src/lapack/dorgqr.c --- hypre-2.11.2/src/lapack/dorgqr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorgqr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorgqr_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ integer dorgqr_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -83,15 +86,15 @@ integer a_dim1, a_offset, i__1, i__2, i__3; /* Local variables */ static integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ HYPRE_Int dorg2r_(integer *, integer *, integer *, + extern /* Subroutine */ integer dorg2r_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static integer ib, nb, ki, kk; - extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nx; - extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -263,4 +266,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorgtr.c hypre-2.13.0/src/lapack/dorgtr.c --- hypre-2.11.2/src/lapack/dorgtr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorgtr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorgtr_(const char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ integer dorgtr_(const char *uplo, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -81,10 +84,10 @@ static integer iinfo; static logical upper; static integer nb; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int dorgql_(integer *, integer *, integer *, + extern /* Subroutine */ integer dorgql_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -228,4 +231,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorm2r.c hypre-2.13.0/src/lapack/dorm2r.c --- hypre-2.11.2/src/lapack/dorm2r.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorm2r.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorm2r_(const char *side,const char *trans, integer *m, integer *n, +/* Subroutine */ integer dorm2r_(const char *side,const char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -100,12 +103,12 @@ /* Local variables */ static logical left; static integer i__; - extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(const char *,const char *); static integer i1, i2, i3, ic, jc, mi, ni, nq; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical notran; static doublereal aii; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -213,4 +216,6 @@ #undef c___ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dormbr.c hypre-2.13.0/src/lapack/dormbr.c --- hypre-2.11.2/src/lapack/dormbr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dormbr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dormbr_(const char *vect,const char *side,const char *trans, integer *m, +/* Subroutine */ integer dormbr_(const char *vect,const char *side,const char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -133,19 +136,19 @@ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2]; char ch__1[2]; /* Builtin functions - Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen); + Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen); /* Local variables */ static logical left; extern logical lsame_(const char *,const char *); static integer iinfo, i1, i2, nb, mi, ni, nq, nw; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int dormlq_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dormlq_(const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); static logical notran; - extern /* Subroutine */ HYPRE_Int dormqr_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dormqr_(const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); static logical applyq; @@ -341,4 +344,6 @@ #undef c___ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dorml2.c hypre-2.13.0/src/lapack/dorml2.c --- hypre-2.11.2/src/lapack/dorml2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dorml2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dorml2_(const char *side,const char *trans, integer *m, integer *n, +/* Subroutine */ integer dorml2_(const char *side,const char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -97,12 +100,12 @@ /* Local variables */ static logical left; static integer i__; - extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(const char *,const char *); static integer i1, i2, i3, ic, jc, mi, ni, nq; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static logical notran; static doublereal aii; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -209,4 +212,6 @@ #undef c___ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dormlq.c hypre-2.13.0/src/lapack/dormlq.c --- hypre-2.11.2/src/lapack/dormlq.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dormlq.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dormlq_(const char *side,const char *trans, integer *m, integer *n, +/* Subroutine */ integer dormlq_(const char *side,const char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -112,23 +115,23 @@ i__5; char ch__1[2]; /* Builtin functions - Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen); + Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen); /* Local variables */ static logical left; static integer i__; static doublereal t[4160] /* was [65][64] */; extern logical lsame_(const char *,const char *); static integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ HYPRE_Int dorml2_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dorml2_(const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static integer ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nq, nw; - extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -314,4 +317,6 @@ #undef c___ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dormqr.c hypre-2.13.0/src/lapack/dormqr.c --- hypre-2.11.2/src/lapack/dormqr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dormqr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dormqr_(const char *side,const char *trans, integer *m, integer *n, +/* Subroutine */ integer dormqr_(const char *side,const char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -112,23 +115,23 @@ i__5; char ch__1[2]; /* Builtin functions - Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen); + Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen); /* Local variables */ static logical left; static integer i__; static doublereal t[4160] /* was [65][64] */; extern logical lsame_(const char *,const char *); static integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ HYPRE_Int dorm2r_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dorm2r_(const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static integer ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nq, nw; - extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -306,4 +309,6 @@ #undef c___ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dpotf2.c hypre-2.13.0/src/lapack/dpotf2.c --- hypre-2.11.2/src/lapack/dpotf2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dpotf2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dpotf2_(const char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ integer dpotf2_(const char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -73,19 +76,19 @@ integer a_dim1, a_offset, i__1, i__2, i__3; doublereal d__1; /* Builtin functions */ - /*HYPRE_Real sqrt(doublereal);*/ + /*doublereal sqrt(doublereal);*/ /* Local variables */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); static integer j; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, + extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static doublereal ajj; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -197,4 +200,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dpotrf.c hypre-2.13.0/src/lapack/dpotrf.c --- hypre-2.11.2/src/lapack/dpotrf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dpotrf.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dpotrf_(const char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ integer dpotrf_(const char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -72,20 +75,20 @@ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ static integer j; - extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int dsyrk_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dsyrk_(const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *), dpotf2_(const char *, integer *, doublereal *, integer *, integer *); static integer jb, nb; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -223,4 +226,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dpotrs.c hypre-2.13.0/src/lapack/dpotrs.c --- hypre-2.11.2/src/lapack/dpotrs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dpotrs.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dpotrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ integer dpotrs_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -64,11 +67,11 @@ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); a_dim1 = *lda; @@ -138,3 +141,6 @@ } /* dpotrs_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsteqr.c hypre-2.13.0/src/lapack/dsteqr.c --- hypre-2.11.2/src/lapack/dsteqr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsteqr.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsteqr_(const char *compz, integer *n, doublereal *d__, +/* Subroutine */ integer dsteqr_(const char *compz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -89,22 +92,22 @@ integer z_dim1, z_offset, i__1, i__2; doublereal d__1, d__2; /* Builtin functions */ - HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *); + doublereal d_sign(doublereal *, doublereal *); /* Local variables */ static integer lend, jtot; - extern /* Subroutine */ HYPRE_Int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ integer dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); static doublereal b, c__, f, g; static integer i__, j, k, l, m; static doublereal p, r__, s; extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dlasr_(const char *,const char *,const char *, integer *, + extern /* Subroutine */ integer dlasr_(const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); static doublereal anorm; - extern /* Subroutine */ HYPRE_Int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ integer dswap_(integer *, doublereal *, integer *, doublereal *, integer *); static integer l1; - extern /* Subroutine */ HYPRE_Int dlaev2_(doublereal *, doublereal *, + extern /* Subroutine */ integer dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); static integer lendm1, lendp1; @@ -112,17 +115,17 @@ static integer ii; extern doublereal dlamch_(const char *); static integer mm, iscale; - extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(const char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); static doublereal safmin; - extern /* Subroutine */ HYPRE_Int dlartg_(doublereal *, doublereal *, + extern /* Subroutine */ integer dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); static doublereal safmax; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern doublereal dlanst_(const char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, integer *); static integer lendsv; static doublereal ssfmin; @@ -598,4 +601,6 @@ #undef z___ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsterf.c hypre-2.13.0/src/lapack/dsterf.c --- hypre-2.11.2/src/lapack/dsterf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsterf.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsterf_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ integer dsterf_(integer *n, doublereal *d__, doublereal *e, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -54,11 +57,11 @@ integer i__1; doublereal d__1, d__2, d__3; /* Builtin functions */ - HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *); + doublereal d_sign(doublereal *, doublereal *); /* Local variables */ static doublereal oldc; static integer lend, jtot; - extern /* Subroutine */ HYPRE_Int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ integer dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); static doublereal c__; static integer i__, l, m; @@ -68,14 +71,14 @@ static doublereal bb; extern doublereal dlamch_(const char *); static integer iscale; - extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); static doublereal oldgam, safmin; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static doublereal safmax; extern doublereal dlanst_(const char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, integer *); static integer lendsv; static doublereal ssfmin; @@ -433,3 +436,6 @@ } /* dsterf_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsyev.c hypre-2.13.0/src/lapack/dsyev.c --- hypre-2.11.2/src/lapack/dsyev.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsyev.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsyev_(const char *jobz,const char *uplo, integer *n, doublereal *a, +/* Subroutine */ integer dsyev_(const char *jobz,const char *uplo, integer *n, doublereal *a, integer *lda, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -85,15 +88,13 @@ /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; doublereal d__1; - /* Builtin functions */ - HYPRE_Real sqrt(doublereal); /* Local variables */ static integer inde; static doublereal anrm; static integer imax; static doublereal rmin, rmax; /***static integer lopt;***/ - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *); static doublereal sigma; extern logical lsame_(const char *,const char *); @@ -102,21 +103,21 @@ static integer nb; extern doublereal dlamch_(const char *); static integer iscale; - extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); static doublereal safmin; extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static doublereal bignum; static integer indtau; - extern /* Subroutine */ HYPRE_Int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal dlansy_(const char *,const char *, integer *, doublereal *, integer *, doublereal *); static integer indwrk; - extern /* Subroutine */ HYPRE_Int dorgtr_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dorgtr_(const char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dsteqr_(const char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsytrd_(const char *, integer *, doublereal *, integer *, doublereal *, @@ -261,4 +262,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsygs2.c hypre-2.13.0/src/lapack/dsygs2.c --- hypre-2.11.2/src/lapack/dsygs2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsygs2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsygs2_(integer *itype,const char *uplo, integer *n, +/* Subroutine */ integer dsygs2_(integer *itype,const char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -83,21 +86,21 @@ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; doublereal d__1; /* Local variables */ - extern /* Subroutine */ HYPRE_Int dsyr2_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dsyr2_(const char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer k; - extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int dtrmv_(const char *,const char *,const char *, integer *, + extern /* Subroutine */ integer dtrmv_(const char *,const char *,const char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(const char *,const char *,const char *, integer *, doublereal *, integer *, doublereal *, integer *); static doublereal ct; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); static doublereal akk, bkk; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] @@ -272,4 +275,6 @@ #undef b_ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsygst.c hypre-2.13.0/src/lapack/dsygst.c --- hypre-2.11.2/src/lapack/dsygst.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsygst.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsygst_(integer *itype,const char *uplo, integer *n, +/* Subroutine */ integer dsygst_(integer *itype,const char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -87,24 +90,24 @@ /* Local variables */ static integer k; extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dtrmm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrmm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsymm_( const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsygs2_( integer *,const char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); static integer kb; - extern /* Subroutine */ HYPRE_Int dsyr2k_(const char *,const char *, integer *, integer *, + extern /* Subroutine */ integer dsyr2k_(const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static integer nb; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] @@ -318,4 +321,6 @@ #undef b_ref #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsygv.c hypre-2.13.0/src/lapack/dsygv.c --- hypre-2.11.2/src/lapack/dsygv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsygv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsygv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ integer dsygv_(integer *itype, char *jobz, char *uplo, integer * n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -118,22 +121,22 @@ /* Local variables */ static integer neig; extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int dtrmm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrmm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static char trans[1]; - extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, + extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int dsyev_(const char *,const char *, integer *, doublereal * + extern /* Subroutine */ integer dsyev_(const char *,const char *, integer *, doublereal * , integer *, doublereal *, doublereal *, integer *, integer *); static logical wantz; static integer nb; - extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *); + extern /* Subroutine */ integer xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ HYPRE_Int dpotrf_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dpotrf_(const char *, integer *, doublereal *, integer *, integer *), dsygst_(integer *,const char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); static integer lwkopt; @@ -254,3 +257,6 @@ } /* dsygv_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsytd2.c hypre-2.13.0/src/lapack/dsytd2.c --- hypre-2.11.2/src/lapack/dsytd2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsytd2.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsytd2_(const char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ integer dsytd2_(const char *uplo, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tau, integer *info) { /* -- LAPACK routine (version 3.0) -- @@ -127,16 +130,16 @@ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); static doublereal taui; - extern /* Subroutine */ HYPRE_Int dsyr2_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dsyr2_(const char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer i__; static doublereal alpha; extern logical lsame_(const char *,const char *); - extern /* Subroutine */ HYPRE_Int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ integer daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; - extern /* Subroutine */ HYPRE_Int dsymv_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dsymv_(const char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(const char *, integer * @@ -280,4 +283,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/dsytrd.c hypre-2.13.0/src/lapack/dsytrd.c --- hypre-2.11.2/src/lapack/dsytrd.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/dsytrd.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,8 +1,11 @@ -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int dsytrd_(const char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ integer dsytrd_(const char *uplo, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tau, doublereal * work, integer *lwork, integer *info) { @@ -144,12 +147,12 @@ extern logical lsame_(const char *,const char *); static integer nbmin, iinfo; static logical upper; - extern /* Subroutine */ HYPRE_Int dsytd2_(const char *, integer *, doublereal *, + extern /* Subroutine */ integer dsytd2_(const char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *), dsyr2k_(const char *,const char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); static integer nb, kk, nx; - extern /* Subroutine */ HYPRE_Int dlatrd_(const char *, integer *, integer *, + extern /* Subroutine */ integer dlatrd_(const char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *); extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, @@ -339,4 +342,6 @@ #undef a_ref - +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/f2c.h hypre-2.13.0/src/lapack/f2c.h --- hypre-2.11.2/src/lapack/f2c.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/f2c.h 2017-10-20 17:42:22.000000000 +0000 @@ -25,6 +25,13 @@ #define HYPRE_SEQUENTIAL #endif #include "_hypre_utilities.h" +#include "math.h" + +#if defined(HYPRE_SINGLE) +#define sqrt sqrtf +#elif defined(HYPRE_LONG_DOUBLE) +#define sqrt sqrtl +#endif #ifdef HYPRE_BIGINT typedef long long int HYPRE_LongInt; diff -Nru hypre-2.11.2/src/lapack/_hypre_lapack.h hypre-2.13.0/src/lapack/_hypre_lapack.h --- hypre-2.11.2/src/lapack/_hypre_lapack.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/lapack/_hypre_lapack.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,202 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header file for HYPRE LAPACK + * + *****************************************************************************/ + +#ifndef HYPRE_LAPACK_H +#define HYPRE_LAPACK_H + +#include "_hypre_utilities.h" +#include "fortran.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*-------------------------------------------------------------------------- + * Change all 'hypre_' names based on using HYPRE or external library + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_USING_HYPRE_LAPACK + +#define hypre_dbdsqr hypre_F90_NAME_LAPACK(dbdsqr,DBDSQR) +#define hypre_dgebd2 hypre_F90_NAME_LAPACK(dgebd2,DGEBD2) +#define hypre_dgebrd hypre_F90_NAME_LAPACK(dgebrd,DGEBRD) +#define hypre_dgelq2 hypre_F90_NAME_LAPACK(dgelq2,DGELQ2) +#define hypre_dgelqf hypre_F90_NAME_LAPACK(dgelqf,DGELQF) +#define hypre_dgels hypre_F90_NAME_LAPACK(dgels ,DGELS ) +#define hypre_dgeqr2 hypre_F90_NAME_LAPACK(dgeqr2,DGEQR2) +#define hypre_dgeqrf hypre_F90_NAME_LAPACK(dgeqrf,DGEQRF) +#define hypre_dgesvd hypre_F90_NAME_LAPACK(dgesvd,DGESVD) +#define hypre_dgetf2 hypre_F90_NAME_LAPACK(dgetf2,DGETF2) +#define hypre_dgetrf hypre_F90_NAME_LAPACK(dgetrf,DGETRF) +#define hypre_dgetrs hypre_F90_NAME_LAPACK(dgetrs,DGETRS) +#define hypre_dlasq1 hypre_F90_NAME_LAPACK(dlasq1,DLASQ1) +#define hypre_dlasq2 hypre_F90_NAME_LAPACK(dlasq2,DLASQ2) +#define hypre_dlasrt hypre_F90_NAME_LAPACK(dlasrt,DLASRT) +#define hypre_dorg2l hypre_F90_NAME_LAPACK(dorg2l,DORG2L) +#define hypre_dorg2r hypre_F90_NAME_LAPACK(dorg2r,DORG2R) +#define hypre_dorgbr hypre_F90_NAME_LAPACK(dorgbr,DORGBR) +#define hypre_dorgl2 hypre_F90_NAME_LAPACK(dorgl2,DORGL2) +#define hypre_dorglq hypre_F90_NAME_LAPACK(dorglq,DORGLQ) +#define hypre_dorgql hypre_F90_NAME_LAPACK(dorgql,DORGQL) +#define hypre_dorgqr hypre_F90_NAME_LAPACK(dorgqr,DORGQR) +#define hypre_dorgtr hypre_F90_NAME_LAPACK(dorgtr,DORGTR) +#define hypre_dorm2r hypre_F90_NAME_LAPACK(dorm2r,DORM2R) +#define hypre_dormbr hypre_F90_NAME_LAPACK(dormbr,DORMBR) +#define hypre_dorml2 hypre_F90_NAME_LAPACK(dorml2,DORML2) +#define hypre_dormlq hypre_F90_NAME_LAPACK(dormlq,DORMLQ) +#define hypre_dormqr hypre_F90_NAME_LAPACK(dormqr,DORMQR) +#define hypre_dpotf2 hypre_F90_NAME_LAPACK(dpotf2,DPOTF2) +#define hypre_dpotrf hypre_F90_NAME_LAPACK(dpotrf,DPOTRF) +#define hypre_dpotrs hypre_F90_NAME_LAPACK(dpotrs,DPOTRS) +#define hypre_dsteqr hypre_F90_NAME_LAPACK(dsteqr,DSTEQR) +#define hypre_dsterf hypre_F90_NAME_LAPACK(dsterf,DSTERF) +#define hypre_dsyev hypre_F90_NAME_LAPACK(dsyev ,DSYEV ) +#define hypre_dsygs2 hypre_F90_NAME_LAPACK(dsygs2,DSYGS2) +#define hypre_dsygst hypre_F90_NAME_LAPACK(dsygst,DSYGST) +#define hypre_dsygv hypre_F90_NAME_LAPACK(dsygv ,DSYGV ) +#define hypre_dsytd2 hypre_F90_NAME_LAPACK(dsytd2,DSYTD2) +#define hypre_dsytrd hypre_F90_NAME_LAPACK(dsytrd,DSYTRD) + +#endif + +/*-------------------------------------------------------------------------- + * Prototypes + *--------------------------------------------------------------------------*/ + +/* dbdsqr.c */ +HYPRE_Int hypre_dbdsqr (const char *uplo , HYPRE_Int *n , HYPRE_Int *ncvt , HYPRE_Int *nru , HYPRE_Int *ncc , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *vt , HYPRE_Int *ldvt , HYPRE_Real *u , HYPRE_Int *ldu , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info ); + +/* dgebd2.c */ +HYPRE_Int hypre_dgebd2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tauq , HYPRE_Real *taup , HYPRE_Real *work , HYPRE_Int *info ); + +/* dgebrd.c */ +HYPRE_Int hypre_dgebrd ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tauq , HYPRE_Real *taup , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dgelq2.c */ +HYPRE_Int hypre_dgelq2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info ); + +/* dgelqf.c */ +HYPRE_Int hypre_dgelqf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dgels.c */ +HYPRE_Int hypre_dgels ( char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dgeqr2.c */ +HYPRE_Int hypre_dgeqr2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info ); + +/* dgeqrf.c */ +HYPRE_Int hypre_dgeqrf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dgesvd.c */ +HYPRE_Int hypre_dgesvd ( char *jobu , char *jobvt , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *s , HYPRE_Real *u , HYPRE_Int *ldu , HYPRE_Real *vt , HYPRE_Int *ldvt , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dgetf2.c */ +HYPRE_Int hypre_dgetf2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Int *info ); + +/* dgetrf.c */ +HYPRE_Int hypre_dgetrf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Int *info ); + +/* dgetrs.c */ +HYPRE_Int hypre_dgetrs ( const char *trans , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info ); + +/* dlasq1.c */ +HYPRE_Int hypre_dlasq1 ( HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *work , HYPRE_Int *info ); + +/* dlasq2.c */ +HYPRE_Int hypre_dlasq2 ( HYPRE_Int *n , HYPRE_Real *z__ , HYPRE_Int *info ); + +/* dlasrt.c */ +HYPRE_Int hypre_dlasrt (const char *id , HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Int *info ); + +/* dorg2l.c */ +HYPRE_Int hypre_dorg2l ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info ); + +/* dorg2r.c */ +HYPRE_Int hypre_dorg2r ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info ); + +/* dorgbr.c */ +HYPRE_Int hypre_dorgbr (const char *vect , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorgl2.c */ +HYPRE_Int hypre_dorgl2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info ); + +/* dorglq.c */ +HYPRE_Int hypre_dorglq ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorgql.c */ +HYPRE_Int hypre_dorgql ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorgqr.c */ +HYPRE_Int hypre_dorgqr ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorgtr.c */ +HYPRE_Int hypre_dorgtr (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorm2r.c */ +HYPRE_Int hypre_dorm2r (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info ); + +/* dormbr.c */ +HYPRE_Int hypre_dormbr (const char *vect ,const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dorml2.c */ +HYPRE_Int hypre_dorml2 (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info ); + +/* dormlq.c */ +HYPRE_Int hypre_dormlq (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dormqr.c */ +HYPRE_Int hypre_dormqr (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dpotf2.c */ +HYPRE_Int hypre_dpotf2 (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *info ); + +/* dpotrf.c */ +HYPRE_Int hypre_dpotrf (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *info ); + +/* dpotrs.c */ +HYPRE_Int hypre_dpotrs ( char *uplo , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info ); + +/* dsteqr.c */ +HYPRE_Int hypre_dsteqr (const char *compz , HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *z__ , HYPRE_Int *ldz , HYPRE_Real *work , HYPRE_Int *info ); + +/* dsterf.c */ +HYPRE_Int hypre_dsterf ( HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Int *info ); + +/* dsyev.c */ +HYPRE_Int hypre_dsyev (const char *jobz ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *w , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dsygs2.c */ +HYPRE_Int hypre_dsygs2 ( HYPRE_Int *itype ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info ); + +/* dsygst.c */ +HYPRE_Int hypre_dsygst ( HYPRE_Int *itype ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info ); + +/* dsygv.c */ +HYPRE_Int hypre_dsygv ( HYPRE_Int *itype , char *jobz , char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *w , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +/* dsytd2.c */ +HYPRE_Int hypre_dsytd2 (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tau , HYPRE_Int *info ); + +/* dsytrd.c */ +HYPRE_Int hypre_dsytrd (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info ); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru hypre-2.11.2/src/lapack/hypre_lapack.h hypre-2.13.0/src/lapack/hypre_lapack.h --- hypre-2.11.2/src/lapack/hypre_lapack.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/hypre_lapack.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,22 +10,46 @@ * $Revision$ ***********************************************************************EHEADER*/ -/* hypre_lapack.h -- Contains LAPACK prototypes needed by Hypre */ +/***** DO NOT use this file outside of the LAPACK directory *****/ -#ifndef HYPRE_LAPACK_H -#define HYPRE_LAPACK_H -#include "f2c.h" -#include "fortran.h" -#include "math.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* -------------------------------------------------------------------------- - * Change all names to hypre_ to avoid link conflicts - * --------------------------------------------------------------------------*/ +/*-------------------------------------------------------------------------- + * This header renames the functions in LAPACK to avoid conflicts + *--------------------------------------------------------------------------*/ + +/* blas */ +#define dasum_ hypre_dasum +#define daxpy_ hypre_daxpy +#define dcopy_ hypre_dcopy +#define ddot_ hypre_ddot +#define dgemm_ hypre_dgemm +#define dgemv_ hypre_dgemv +#define dger_ hypre_dger +#define dnrm2_ hypre_dnrm2 +#define drot_ hypre_drot +#define dscal_ hypre_dscal +#define dswap_ hypre_dswap +#define dsymm_ hypre_dsymm +#define dsymv_ hypre_dsymv +#define dsyr2_ hypre_dsyr2 +#define dsyr2k_ hypre_dsyr2k +#define dsyrk_ hypre_dsyrk +#define dtrmm_ hypre_dtrmm +#define dtrmv_ hypre_dtrmv +#define dtrsm_ hypre_dtrsm +#define dtrsv_ hypre_dtrsv +#define idamax_ hypre_idamax + +/* f2c library routines */ +#define s_cmp hypre_s_cmp +#define s_copy hypre_s_copy +#define s_cat hypre_s_cat +#define d_lg10 hypre_d_lg10 +#define d_sign hypre_d_sign +#define pow_dd hypre_pow_dd +#define pow_di hypre_pow_di +/* lapack */ +#define dbdsqr_ hypre_dbdsqr #define dgebd2_ hypre_dgebd2 #define dgebrd_ hypre_dgebrd #define dgelq2_ hypre_dgelq2 @@ -37,38 +61,19 @@ #define dgetf2_ hypre_dgetf2 #define dgetrf_ hypre_dgetrf #define dgetrs_ hypre_dgetrs -#define dlabad_ hypre_dlabad -#define dlabrd_ hypre_dlabrd -#define dlae2_ hypre_dlae2 -#define dlaev2_ hypre_dlaev2 -#define dlamch_ hypre_dlamch -#define dlamc1_ hypre_dlamc1 -#define dlamc2_ hypre_dlamc2 -#define dlamc3_ hypre_dlamc3 -#define dlamc4_ hypre_dlamc4 -#define dlamc5_ hypre_dlamc5 -#define dlaswp_ hypre_dlaswp -#define dlange_ hypre_dlange -#define dlanst_ hypre_dlanst -#define dlansy_ hypre_dlansy -#define dlapy2_ hypre_dlapy2 -#define dlarf_ hypre_dlarf -#define dlarfb_ hypre_dlarfb -#define dlarfg_ hypre_dlarfg -#define dlarft_ hypre_dlarft -#define dlartg_ hypre_dlartg -#define dlascl_ hypre_dlascl -#define dlaset_ hypre_dlaset -#define dlasr_ hypre_dlasr +#define dlasq1_ hypre_dlasq1 +#define dlasq2_ hypre_dlasq2 #define dlasrt_ hypre_dlasrt -#define dlassq_ hypre_dlassq -#define dlatrd_ hypre_dlatrd #define dorg2l_ hypre_dorg2l #define dorg2r_ hypre_dorg2r +#define dorgbr_ hypre_dorgbr +#define dorgl2_ hypre_dorgl2 +#define dorglq_ hypre_dorglq #define dorgql_ hypre_dorgql #define dorgqr_ hypre_dorgqr #define dorgtr_ hypre_dorgtr #define dorm2r_ hypre_dorm2r +#define dormbr_ hypre_dormbr #define dorml2_ hypre_dorml2 #define dormlq_ hypre_dormlq #define dormqr_ hypre_dormqr @@ -78,273 +83,51 @@ #define dsteqr_ hypre_dsteqr #define dsterf_ hypre_dsterf #define dsyev_ hypre_dsyev +#define dsygs2_ hypre_dsygs2 #define dsygst_ hypre_dsygst #define dsygv_ hypre_dsygv #define dsytd2_ hypre_dsytd2 #define dsytrd_ hypre_dsytrd -#define ieeeck_ hypre_ieeeck -#define ilaenv_ hypre_ilaenv -#define d_lg10_ hypre_d_lg10 -#define d_sign_ hypre_d_sign -#define pow_di_ hypre_pow_di -#define pow_dd_ hypre_pow_dd -#define s_cat_ hypre_s_cat -#define lsame_ hypre_lsame -#define xerbla_ hypre_xerbla -#define dbdsqr_ hypre_dbdsqr -#define dorgbr_ hypre_dorgbr -#define dsygs2_ hypre_dsygs2 -#define dorglq_ hypre_dorglq + +/* lapack auxiliary routines */ +#define dlabad_ hypre_dlabad +#define dlabrd_ hypre_dlabrd #define dlacpy_ hypre_dlacpy -#define dormbr_ hypre_dormbr -#define dlasq1_ hypre_dlasq1 +#define dlae2_ hypre_dlae2 +#define dlaev2_ hypre_dlaev2 +#define dlamch_ hypre_dlamch +#define dlamc1_ hypre_dlamc1 +#define dlamc2_ hypre_dlamc2 +#define dlamc3_ hypre_dlamc3 +#define dlamc4_ hypre_dlamc4 +#define dlamc5_ hypre_dlamc5 +#define dlange_ hypre_dlange +#define dlanst_ hypre_dlanst +#define dlansy_ hypre_dlansy +#define dlapy2_ hypre_dlapy2 +#define dlarf_ hypre_dlarf +#define dlarfb_ hypre_dlarfb +#define dlarfg_ hypre_dlarfg +#define dlarft_ hypre_dlarft +#define dlartg_ hypre_dlartg #define dlas2_ hypre_dlas2 -#define dlasv2_ hypre_dlasv2 -#define dorgl2_ hypre_dorgl2 -#define dlasq2_ hypre_dlasq2 +#define dlascl_ hypre_dlascl +#define dlaset_ hypre_dlaset #define dlasq3_ hypre_dlasq3 #define dlasq4_ hypre_dlasq4 #define dlasq5_ hypre_dlasq5 #define dlasq6_ hypre_dlasq6 +#define dlasr_ hypre_dlasr +#define dlassq_ hypre_dlassq +#define dlasv2_ hypre_dlasv2 +#define dlaswp_ hypre_dlaswp +#define dlatrd_ hypre_dlatrd +#define ieeeck_ hypre_ieeeck +#define ilaenv_ hypre_ilaenv -#define s_cmp hypre_F90_NAME_BLAS(s_cmp,S_CMP) -#define s_copy hypre_F90_NAME_BLAS(s_copy,S_COPY) -#define d_lg10 hypre_d_lg10 -#define d_sign hypre_d_sign -#define pow_dd hypre_pow_dd -#define pow_di hypre_pow_di -#define s_cat hypre_s_cat - -/* -------------------------------------------------------------------------- - * Prototypes - * --------------------------------------------------------------------------*/ - -/* dgebd2.c */ -HYPRE_Int dgebd2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *work , integer *info ); - -/* dgebrd.c */ -HYPRE_Int dgebrd_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *work , integer *lwork , integer *info ); - -/* dgelq2.c */ -HYPRE_Int dgelq2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info ); - -/* dgelqf.c */ -HYPRE_Int dgelqf_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dgels.c */ -HYPRE_Int dgels_ ( char *trans , integer *m , integer *n , integer *nrhs , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *work , integer *lwork , integer *info ); - -/* dgeqr2.c */ -HYPRE_Int dgeqr2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info ); - -/* dgeqrf.c */ -HYPRE_Int dgeqrf_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dgesvd.c */ -HYPRE_Int dgesvd_ ( char *jobu , char *jobvt , integer *m , integer *n , doublereal *a , integer *lda , doublereal *s , doublereal *u , integer *ldu , doublereal *vt , integer *ldvt , doublereal *work , integer *lwork , integer *info ); - - -/* dgetf2.c */ -HYPRE_Int dgetf2_( integer *m , integer *n , doublereal *a , integer *lda , integer *ipiv , integer *info ); - -/* dgetrf.c */ -HYPRE_Int dgetrf_( integer *m , integer *n , doublereal *a , integer *lda , integer *ipiv , integer *info ); - -/* dgetrs.c */ -HYPRE_Int dgetrs_( char *trans , integer *n , integer *nrhs , doublereal *a , integer *lda , integer *ipiv , doublereal *b , integer *ldb , integer *info ); - -/* dlabad.c */ -HYPRE_Int dlabad_ ( doublereal *small , doublereal *large ); - -/* dlabrd.c */ -HYPRE_Int dlabrd_ ( integer *m , integer *n , integer *nb , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *x , integer *ldx , doublereal *y , integer *ldy ); - -/* dlae2.c */ -HYPRE_Int dlae2_ ( doublereal *a , doublereal *b , doublereal *c__ , doublereal *rt1 , doublereal *rt2 ); - -/* dlaev2.c */ -HYPRE_Int dlaev2_ ( doublereal *a , doublereal *b , doublereal *c__ , doublereal *rt1 , doublereal *rt2 , doublereal *cs1 , doublereal *sn1 ); - -/* dlamch.c */ -doublereal dlamch_ (const char *cmach ); -HYPRE_Int dlamc1_ ( integer *beta , integer *t , logical *rnd , logical *ieee1 ); -HYPRE_Int dlamc2_ ( integer *beta , integer *t , logical *rnd , doublereal *eps , integer *emin , doublereal *rmin , integer *emax , doublereal *rmax ); -doublereal dlamc3_ ( doublereal *a , doublereal *b ); -HYPRE_Int dlamc4_ ( integer *emin , doublereal *start , integer *base ); -HYPRE_Int dlamc5_ ( integer *beta , integer *p , integer *emin , logical *ieee , integer *emax , doublereal *rmax ); - - -/* dlaswp.c */ -HYPRE_Int dlaswp_( integer *n , doublereal *a , integer *lda , integer *k1 , integer *k2 , integer *ipiv , integer *incx ); - -/* dlange.c */ -doublereal dlange_ (const char *norm , integer *m , integer *n , doublereal *a , integer *lda , doublereal *work ); - -/* dlanst.c */ -doublereal dlanst_ (const char *norm , integer *n , doublereal *d__ , doublereal *e ); - -/* dlansy.c */ -doublereal dlansy_ (const char *norm ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *work ); - -/* dlapy2.c */ -doublereal dlapy2_ ( doublereal *x , doublereal *y ); - -/* dlarf.c */ -HYPRE_Int dlarf_ (const char *side , integer *m , integer *n , doublereal *v , integer *incv , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work ); - -/* dlarfb.c */ -HYPRE_Int dlarfb_ (const char *side ,const char *trans ,const char *direct ,const char *storev , integer *m , integer *n , integer *k , doublereal *v , integer *ldv , doublereal *t , integer *ldt , doublereal *c__ , integer *ldc , doublereal *work , integer *ldwork ); - -/* dlarfg.c */ -HYPRE_Int dlarfg_ ( integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *tau ); - -/* dlarft.c */ -HYPRE_Int dlarft_ (const char *direct ,const char *storev , integer *n , integer *k , doublereal *v , integer *ldv , doublereal *tau , doublereal *t , integer *ldt ); - -/* dlartg.c */ -HYPRE_Int dlartg_ ( doublereal *f , doublereal *g , doublereal *cs , doublereal *sn , doublereal *r__ ); - -/* dlascl.c */ -HYPRE_Int dlascl_ (const char *type__ , integer *kl , integer *ku , doublereal *cfrom , doublereal *cto , integer *m , integer *n , doublereal *a , integer *lda , integer *info ); - -/* dlaset.c */ -HYPRE_Int dlaset_ (const char *uplo , integer *m , integer *n , doublereal *alpha , doublereal *beta , doublereal *a , integer *lda ); - -/* dlasr.c */ -HYPRE_Int dlasr_ (const char *side ,const char *pivot ,const char *direct , integer *m , integer *n , doublereal *c__ , doublereal *s , doublereal *a , integer *lda ); - -/* dlasrt.c */ -HYPRE_Int dlasrt_ (const char *id , integer *n , doublereal *d__ , integer *info ); - -/* dlassq.c */ -HYPRE_Int dlassq_ ( integer *n , doublereal *x , integer *incx , doublereal *scale , doublereal *sumsq ); - -/* dlatrd.c */ -HYPRE_Int dlatrd_ (const char *uplo , integer *n , integer *nb , doublereal *a , integer *lda , doublereal *e , doublereal *tau , doublereal *w , integer *ldw ); - -/* dorg2l.c */ -HYPRE_Int dorg2l_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info ); - -/* dorg2r.c */ -HYPRE_Int dorg2r_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info ); - -/* dorgql.c */ -HYPRE_Int dorgql_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dorgqr.c */ -HYPRE_Int dorgqr_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dorgtr.c */ -HYPRE_Int dorgtr_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dorm2r.c */ -HYPRE_Int dorm2r_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *info ); - -/* dorml2.c */ -HYPRE_Int dorml2_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *info ); - -/* dormlq.c */ -HYPRE_Int dormlq_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info ); - -/* dormqr.c */ -HYPRE_Int dormqr_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info ); - -/* dpotf2.c */ -HYPRE_Int dpotf2_ (const char *uplo , integer *n , doublereal *a , integer *lda , integer *info ); - -/* dpotrf.c */ -HYPRE_Int dpotrf_ (const char *uplo , integer *n , doublereal *a , integer *lda , integer *info ); - -/* dpotrs.c */ -HYPRE_Int dpotrs_ ( char *uplo , integer *n , integer *nrhs , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info ); - -/* dsteqr.c */ -HYPRE_Int dsteqr_ (const char *compz , integer *n , doublereal *d__ , doublereal *e , doublereal *z__ , integer *ldz , doublereal *work , integer *info ); - -/* dsterf.c */ -HYPRE_Int dsterf_ ( integer *n , doublereal *d__ , doublereal *e , integer *info ); - -/* dsyev.c */ -HYPRE_Int dsyev_ (const char *jobz ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *w , doublereal *work , integer *lwork , integer *info ); - -/* dsygst.c */ -HYPRE_Int dsygst_ ( integer *itype ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info ); - -/* dsygv.c */ -HYPRE_Int dsygv_ ( integer *itype , char *jobz , char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *w , doublereal *work , integer *lwork , integer *info ); - -/* dsytd2.c */ -HYPRE_Int dsytd2_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tau , integer *info ); - -/* dsytrd.c */ -HYPRE_Int dsytrd_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* ieeeck.c */ -integer ieeeck_ ( integer *ispec , real *zero , real *one ); - -/* ilaenv.c */ -integer ilaenv_ ( integer *ispec ,const char *name__ ,const char *opts , integer *n1 , integer *n2 , integer *n3 , integer *n4 , ftnlen name_len , ftnlen opts_len ); - -/* lapack_utils.c */ -HYPRE_Real d_lg10 ( doublereal *x ); -HYPRE_Real d_sign ( doublereal *a , doublereal *b ); -HYPRE_Real pow_di ( doublereal *ap , integer *bp ); -HYPRE_Real pow_dd ( doublereal *ap , doublereal *bp ); -HYPRE_Int s_cat ( char *lp , char *rpp [], ftnlen rnp [], ftnlen *np , ftnlen ll ); - -/* lsame.c */ -logical lsame_ (const char *ca ,const char *cb ); - -/* xerbla.c */ -HYPRE_Int xerbla_ (const char *srname , integer *info ); - -/* dbdsqr.c */ -HYPRE_Int dbdsqr_ (const char *uplo , integer *n , integer *ncvt , integer *nru , integer *ncc , doublereal *d__ , doublereal *e , doublereal *vt , integer *ldvt , doublereal *u , integer *ldu , doublereal *c__ , integer *ldc , doublereal *work , integer *info ); - -/* dorgbr.c */ -HYPRE_Int dorgbr_ (const char *vect , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dsygs2.c */ -HYPRE_Int dsygs2_ ( integer *itype ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info ); - -/* dorglq.c */ -HYPRE_Int dorglq_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info ); - -/* dlacpy.c */ -HYPRE_Int dlacpy_ (const char *uplo , integer *m , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb ); - -/* dormbr.c */ -HYPRE_Int dormbr_ (const char *vect ,const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info ); - -/* dlasq1.c */ -HYPRE_Int dlasq1_ ( integer *n , doublereal *d__ , doublereal *e , doublereal *work , integer *info ); - -/* dlas2.c */ -HYPRE_Int dlas2_ ( doublereal *f , doublereal *g , doublereal *h__ , doublereal *ssmin , doublereal *ssmax ); - -/* dlasv2.c */ -HYPRE_Int dlasv2_ ( doublereal *f , doublereal *g , doublereal *h__ , doublereal *ssmin , doublereal *ssmax , doublereal *snr , doublereal *csr , doublereal *snl , doublereal *csl ); - -/* dorgl2.c */ -HYPRE_Int dorgl2_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info ); - -/* dlasq2.c */ -HYPRE_Int dlasq2_ ( integer *n , doublereal *z__ , integer *info ); - -/* dlasq3.c */ -HYPRE_Int dlasq3_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *dmin__ , doublereal *sigma , doublereal *desig , doublereal *qmax , integer *nfail , integer *iter , integer *ndiv , logical *ieee ); - -/* dlasq4.c */ -HYPRE_Int dlasq4_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , integer *n0in , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dn1 , doublereal *dn2 , doublereal *tau , integer *ttype ); - -/* dlasq5.c */ -HYPRE_Int dlasq5_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *tau , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dnm1 , doublereal *dnm2 , logical *ieee ); - -/* dlasq6.c */ -HYPRE_Int dlasq6_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dnm1 , doublereal *dnm2 ); - -#ifdef __cplusplus -} -#endif +/* these auxiliary routines have a different definition in BLAS */ +#define lsame_ hypre_lapack_lsame +#define xerbla_ hypre_lapack_xerbla -#endif +/* this is needed so that lapack can call external BLAS */ +#include "_hypre_blas.h" diff -Nru hypre-2.11.2/src/lapack/ieeeck.c hypre-2.13.0/src/lapack/ieeeck.c --- hypre-2.11.2/src/lapack/ieeeck.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/ieeeck.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,6 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" integer ieeeck_(integer *ispec, real *zero, real *one) { @@ -149,3 +152,6 @@ return ret_val; } /* ieeeck_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/ilaenv.c hypre-2.13.0/src/lapack/ilaenv.c --- hypre-2.11.2/src/lapack/ilaenv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/ilaenv.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "hypre_lapack.h" #include "f2c.h" -#include "hypre_blas.h" +#include "hypre_lapack.h" integer ilaenv_(integer *ispec,const char *name__,const char *opts, integer *n1, integer *n2, integer *n3, integer *n4, ftnlen name_len, ftnlen @@ -113,8 +115,9 @@ /* Builtin functions Subroutine */ - //VOID s_copy( char *, char *, ftnlen, ftnlen); - //integer s_cmp( char *, char *, ftnlen, ftnlen); + /* Builtin functions */ + /* Subroutine */ integer s_copy(char *, const char *, ftnlen, ftnlen); + integer s_cmp(char *, const char *, ftnlen, ftnlen); /* Local variables */ static integer i__; @@ -228,7 +231,7 @@ In these examples, separate code is provided for setting NB for real and complex. We assume that NB will take the same value in - single or HYPRE_Real precision. */ + single or doublereal precision. */ nb = 1; @@ -615,3 +618,6 @@ } /* ilaenv_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/lapack_utils.c hypre-2.13.0/src/lapack/lapack_utils.c --- hypre-2.11.2/src/lapack/lapack_utils.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/lapack_utils.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,123 +0,0 @@ -/*BHEADER********************************************************************** - * Copyright (c) 2008, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * This file is part of HYPRE. See file COPYRIGHT for details. - * - * HYPRE is free software; you can redistribute it and/or modify it under the - * terms of the GNU Lesser General Public License (as published by the Free - * Software Foundation) version 2.1 dated February 1999. - * - * $Revision$ - ***********************************************************************EHEADER*/ - - - - -#include "f2c.h" -#include "hypre_lapack.h" - -#define log10e 0.43429448190325182765 - -#ifdef KR_headers -HYPRE_Real log(); -HYPRE_Real d_lg10(x) doublereal *x; -#else -/* -#undef abs -#include "math.h" -*/ -/*FIXME: need to figure out how log calls to optimize this part*/ -#include "math.h" -//HYPRE_Real log(HYPRE_Real); /* declaration added 2/17/00 */ -HYPRE_Real d_lg10(doublereal *x) -#endif -{ -return( log10e * log(*x) ); -} -#include "f2c.h" - -#ifdef KR_headers -HYPRE_Real d_sign(a,b) doublereal *a, *b; -#else -HYPRE_Real d_sign(doublereal *a, doublereal *b) -#endif -{ -HYPRE_Real x; -x = (*a >= 0 ? *a : - *a); -return( *b >= 0 ? x : -x); -} - -#include "f2c.h" - -#ifdef KR_headers -HYPRE_Real pow_di(ap, bp) doublereal *ap; integer *bp; -#else -HYPRE_Real pow_di(doublereal *ap, integer *bp) -#endif -{ -HYPRE_Real pow, x; -integer n; - -pow = 1; -x = *ap; -n = *bp; - -if(n != 0) - { - if(n < 0) - { - n = -n; - x = 1/x; - } - for( ; ; ) - { - if(n & 01) - pow *= x; - if(n >>= 1) - x *= x; - else - break; - } - } -return(pow); -} -#include "f2c.h" - -#ifdef KR_headers -HYPRE_Real pow(); -HYPRE_Real pow_dd(ap, bp) doublereal *ap, *bp; -#else -#undef abs -#include "math.h" -HYPRE_Real pow_dd(doublereal *ap, doublereal *bp) -#endif -{ -return(pow(*ap, *bp) ); -} - -#include "f2c.h" - -#ifdef KR_headers -HYPRE_Int s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnlen rnp[], *np, ll; -#else -HYPRE_Int s_cat(char *lp, char *rpp[], ftnlen rnp[], ftnlen *np, ftnlen ll) -#endif -{ -ftnlen i, n, nc; -char *f__rp; - -n = (HYPRE_Int)*np; -for(i = 0 ; i < n ; ++i) - { - nc = ll; - if(rnp[i] < nc) - nc = rnp[i]; - ll -= nc; - f__rp = rpp[i]; - while(--nc >= 0) - *lp++ = *f__rp++; - } -while(--ll >= 0) - *lp++ = ' '; -return 0; -} diff -Nru hypre-2.11.2/src/lapack/lsame.c hypre-2.13.0/src/lapack/lsame.c --- hypre-2.11.2/src/lapack/lsame.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/lsame.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,7 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include "../blas/hypre_blas.h" -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" logical lsame_(const char *ca,const char *cb) { @@ -104,3 +106,6 @@ return ret_val; } /* lsame_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lapack/Makefile hypre-2.13.0/src/lapack/Makefile --- hypre-2.11.2/src/lapack/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -15,7 +15,7 @@ include ../config/Makefile.config -C_COMPILE_FLAGS = ${INCLUDES} -I.. -I../utilities +C_COMPILE_FLAGS = ${INCLUDES} -I.. -I../blas -I../utilities LAPACK_HEADERS = f2c.h hypre_lapack.h LAPACK_FILES = \ @@ -86,7 +86,6 @@ dsytrd.c\ ieeeck.c\ ilaenv.c\ - lapack_utils.c\ lsame.c\ xerbla.c @@ -99,6 +98,7 @@ all: ${OBJS} dlamch.o install: all + cp -fR $(srcdir)/_hypre_lapack.h $(HYPRE_INC_INSTALL) clean: rm -rf *.o @@ -113,7 +113,4 @@ dlamch.o : dlamch.c ${LAPACK_HEADERS} ${CC} ${CFLAGS} -c dlamch.c -ilaenv.o : ilaenv.c ${LAPACK_HEADERS} - ${CC} ${CFLAGS} -I../blas -c ilaenv.c - ${OBJS}: ${LAPACK_HEADERS} diff -Nru hypre-2.11.2/src/lapack/README hypre-2.13.0/src/lapack/README --- hypre-2.11.2/src/lapack/README 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/lapack/README 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,5 @@ + +HYPRE LAPACK README file + +- See blas/README file for instructions on how to add LAPACK routines. + diff -Nru hypre-2.11.2/src/lapack/xerbla.c hypre-2.13.0/src/lapack/xerbla.c --- hypre-2.11.2/src/lapack/xerbla.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lapack/xerbla.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,9 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif -#include -#include "hypre_lapack.h" #include "f2c.h" +#include "hypre_lapack.h" -/* Subroutine */ HYPRE_Int xerbla_(const char *srname, integer *info) +/* Subroutine */ integer xerbla_(const char *srname, integer *info) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., @@ -36,10 +38,13 @@ */ hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n", - srname, (HYPRE_Int)*info); + srname, (integer)*info); /* End of XERBLA */ return 0; } /* xerbla_ */ +#ifdef __cplusplus +} +#endif diff -Nru hypre-2.11.2/src/lib/Makefile hypre-2.13.0/src/lib/Makefile --- hypre-2.11.2/src/lib/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/lib/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -12,7 +12,6 @@ include ../config/Makefile.config -SUPERLUFILES = ${HYPRE_FEI_SUPERLU_FILES} FEIHYPREFILES = ${HYPRE_FEI_HYPRE_FILES} FEMLIFILES = ${HYPRE_FEI_FEMLI_FILES} IJMVFILES = ${HYPRE_SRC_TOP_DIR}/IJ_mv/*.o @@ -32,11 +31,10 @@ STRUCTLSFILES = ${HYPRE_SRC_TOP_DIR}/struct_ls/*.o STRUCTMVFILES = ${HYPRE_SRC_TOP_DIR}/struct_mv/*.o UTILITIESFILES = ${HYPRE_SRC_TOP_DIR}/utilities/*.o -BLASFILES = ${HYPRE_BLAS_FILES} -LAPACKFILES = ${HYPRE_LAPACK_FILES} +BLASFILES = ${HYPRE_SRC_TOP_DIR}/blas/*.o +LAPACKFILES = ${HYPRE_SRC_TOP_DIR}/lapack/*.o FILES_HYPRE = \ -$(SUPERLUFILES)\ $(FEIHYPREFILES)\ $(FEMLIFILES)\ $(IJMVFILES)\ @@ -86,7 +84,7 @@ libHYPRE.a: ${FILES_HYPRE} @echo "Building libHYPRE ... " - ${AR} $@ $(SUPERLUFILES) $(FEIHYPREFILES) $(FEMLIFILES) $(IJMVFILES) + ${AR} $@ $(FEIHYPREFILES) $(FEMLIFILES) $(IJMVFILES) ${AR} $@ $(EUCLIDFILES) ${AR} $@ $(PARASAILSFILES) ${AR} $@ $(PILUTFILES) diff -Nru hypre-2.11.2/src/Makefile hypre-2.13.0/src/Makefile --- hypre-2.11.2/src/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -10,14 +10,15 @@ # $Revision$ #EHEADER********************************************************************** +default: all # Include all variables defined by configure include config/Makefile.config # These are the directories for internal blas, lapack and general utilities HYPRE_BASIC_DIRS =\ - ${HYPRE_BLAS_SRC_DIR}\ - ${HYPRE_LAPACK_SRC_DIR}\ + blas\ + lapack\ utilities #These are the directories for multivector diff -Nru hypre-2.11.2/src/multivector/backup.c hypre-2.13.0/src/multivector/backup.c --- hypre-2.11.2/src/multivector/backup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/multivector/backup.c 2017-10-20 17:42:22.000000000 +0000 @@ -132,10 +132,10 @@ hypre_assert( data != NULL ); - srand( seed ); + hypre_SeedRand( seed ); for ( i = 0; i < data->numVectors; i++ ) { if ( data->mask == NULL || (data->mask)[i] ) { - seed = rand(); + seed = hypre_RandI(); (data->interpreter->SetRandomValues)(data->vector[i],seed); } } diff -Nru hypre-2.11.2/src/multivector/csr_matmultivec.c hypre-2.13.0/src/multivector/csr_matmultivec.c --- hypre-2.11.2/src/multivector/csr_matmultivec.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/multivector/csr_matmultivec.c 2017-10-20 17:42:22.000000000 +0000 @@ -114,8 +114,8 @@ temp = y_data[i]; for (jj = A_i[i]; jj < A_i[i+1]; jj++) temp += A_data[jj] * x_data[A_j[jj]]; + y_data[i] = temp; } - y_data[i] = temp; } else { diff -Nru hypre-2.11.2/src/parcsr_block_mv/csr_block_matrix.c hypre-2.13.0/src/parcsr_block_mv/csr_block_matrix.c --- hypre-2.11.2/src/parcsr_block_mv/csr_block_matrix.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_block_mv/csr_block_matrix.c 2017-10-20 17:42:22.000000000 +0000 @@ -18,16 +18,6 @@ #include "_hypre_parcsr_block_mv.h" -#define LB_VERSION 0 /* lapack and blas version 5/12/06 - preliminary testing - shows this is slower for block sizes < 4 (did not try - larger block sizes)- need to edit the Makefile to find - blas and lapack */ - -#if LB_VERSION -#include "hypre_blas.h" -#include "hypre_lapack.h" -#endif - /*-------------------------------------------------------------------------- * hypre_CSRBlockMatrixCreate *--------------------------------------------------------------------------*/ diff -Nru hypre-2.11.2/src/parcsr_ls/ame.c hypre-2.13.0/src/parcsr_ls/ame.c --- hypre-2.11.2/src/parcsr_ls/ame.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/ame.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,10 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - - #include "_hypre_parcsr_ls.h" #include "float.h" #include "ams.h" @@ -588,13 +584,8 @@ lobpcg_Tolerance lobpcg_tol; HYPRE_Real *residuals; -#ifdef HYPRE_USING_ESSL - blap_fn.dsygv = dsygv; - blap_fn.dpotrf = dpotrf; -#else - blap_fn.dsygv = hypre_F90_NAME_LAPACK(dsygv,DSYGV); - blap_fn.dpotrf = hypre_F90_NAME_LAPACK(dpotrf,DPOTRF); -#endif + blap_fn.dsygv = hypre_dsygv; + blap_fn.dpotrf = hypre_dpotrf; lobpcg_tol.relative = ame_data -> rtol; lobpcg_tol.absolute = ame_data -> atol; residuals = hypre_TAlloc(HYPRE_Real, ame_data -> block_size); diff -Nru hypre-2.11.2/src/parcsr_ls/ame.h hypre-2.13.0/src/parcsr_ls/ame.h --- hypre-2.11.2/src/parcsr_ls/ame.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/ame.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,10 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - - #ifndef hypre_AME_HEADER #define hypre_AME_HEADER @@ -56,16 +52,6 @@ } hypre_AMEData; -#include "fortran.h" -#ifdef __cplusplus -extern "C" { -#endif - -HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dsygv,DSYGV)(HYPRE_Int *, char *, char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, - HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -#ifdef __cplusplus -} -#endif +#include "_hypre_lapack.h" #endif diff -Nru hypre-2.11.2/src/parcsr_ls/amg_hybrid.c hypre-2.13.0/src/parcsr_ls/amg_hybrid.c --- hypre-2.11.2/src/parcsr_ls/amg_hybrid.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/amg_hybrid.c 2017-10-20 17:42:22.000000000 +0000 @@ -123,13 +123,13 @@ (AMGhybrid_data -> strong_threshold) = 0.25; (AMGhybrid_data -> max_row_sum) = 0.9; (AMGhybrid_data -> trunc_factor) = 0.0; - (AMGhybrid_data -> pmax) = 0; + (AMGhybrid_data -> pmax) = 4; (AMGhybrid_data -> max_levels) = 25; (AMGhybrid_data -> measure_type) = 0; - (AMGhybrid_data -> coarsen_type) = 6; - (AMGhybrid_data -> interp_type) = 0; + (AMGhybrid_data -> coarsen_type) = 10; + (AMGhybrid_data -> interp_type) = 6; (AMGhybrid_data -> cycle_type) = 1; - (AMGhybrid_data -> relax_order) = 1; + (AMGhybrid_data -> relax_order) = 0; (AMGhybrid_data -> max_coarse_size) = 9; (AMGhybrid_data -> min_coarse_size) = 1; (AMGhybrid_data -> seq_threshold) = 0; @@ -1860,9 +1860,15 @@ boom_grt = hypre_CTAlloc(HYPRE_Int,4); for (i=0; i < 4; i++) boom_grt[i] = grid_relax_type[i]; - if (solver_type == 1 && grid_relax_type[1] == 3 && - grid_relax_type[2] == 3) - boom_grt[2] = 4; + hypre_BoomerAMGSetGridRelaxType(pcg_precond, boom_grt); + } + else + { + boom_grt = hypre_CTAlloc(HYPRE_Int,4); + boom_grt[0] = 3; + boom_grt[1] = 13; + boom_grt[2] = 14; + boom_grt[3] = 9; hypre_BoomerAMGSetGridRelaxType(pcg_precond, boom_grt); } if (relax_weight) diff -Nru hypre-2.11.2/src/parcsr_ls/ams.c hypre-2.13.0/src/parcsr_ls/ams.c --- hypre-2.11.2/src/parcsr_ls/ams.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/ams.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,10 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - - #include "_hypre_parcsr_ls.h" #include "float.h" #include "ams.h" @@ -24,7 +20,7 @@ * Relaxation on the ParCSR matrix A with right-hand side f and * initial guess u. Possible values for relax_type are: * - * 1 = l1-scaled Jacobi + * 1 = l1-scaled (or weighted) Jacobi * 2 = l1-scaled block Gauss-Seidel/SSOR * 3 = Kaczmarz * 4 = truncated version of 2 (Remark 6.2 in smoothers paper) @@ -70,14 +66,27 @@ { if (relax_type == 1) /* l1-scaled Jacobi */ { - HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A); - + PUSH_RANGE_PAYLOAD("RELAX",4,sweep); + HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A); +#ifdef HYPRE_USE_GPU + if (sweep==0){ + hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(v)); + hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f)); + } + VecCopy(v_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(v)),HYPRE_STREAM(4)); +#else hypre_ParVectorCopy(f,v); +#endif hypre_ParCSRMatrixMatvec(-relax_weight, A, u, relax_weight, v); - +#ifdef HYPRE_USE_GPU + + VecScale(u_data,v_data,l1_norms,num_rows,HYPRE_STREAM(4)); +#else /* u += w D^{-1}(f - A u), where D_ii = ||A(i,:)||_1 */ for (i = 0; i < num_rows; i++) u_data[i] += v_data[i] / l1_norms[i]; +#endif + POP_RANGE; } else if (relax_type == 2 || relax_type == 4) /* offd-l1-scaled block GS */ { @@ -704,6 +713,18 @@ l1_norm[i] = diag; } } + else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */ + { + for (i = 0; i < num_rows; i++) + { + diag = A_diag_data[A_diag_I[i]]; + if (diag != 0.0) l1_norm[i] = diag; + else l1_norm[i] = 1.0; + } + *l1_norm_ptr = l1_norm; + + return hypre_error_flag; + } /* Handle negative definite matrices */ for (i = 0; i < num_rows; i++) @@ -718,6 +739,7 @@ break; } + //for (i = 0; i < num_rows; i++) l1_norm[i]=1.0/l1_norm[i]; hypre_TFree(cf_marker_offd); *l1_norm_ptr = l1_norm; @@ -1077,7 +1099,7 @@ ams_data -> A_Pi = A_Pi; /* Penalize the eliminated degrees of freedom */ - hypre_ParCSRMatrixSetDiagRows(A_Pi, DBL_MAX); + hypre_ParCSRMatrixSetDiagRows(A_Pi, HYPRE_REAL_MAX); /* Make sure that the first entry in each row is the diagonal one. */ /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_Pi)); */ @@ -1106,7 +1128,7 @@ else { /* Penalize the eliminated degrees of freedom */ - hypre_ParCSRMatrixSetDiagRows(A_G, DBL_MAX); + hypre_ParCSRMatrixSetDiagRows(A_G, HYPRE_REAL_MAX); /* Make sure that the first entry in each row is the diagonal one. */ /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_G)); */ diff -Nru hypre-2.11.2/src/parcsr_ls/F90_HYPRE_parcsr_amg.c hypre-2.13.0/src/parcsr_ls/F90_HYPRE_parcsr_amg.c --- hypre-2.11.2/src/parcsr_ls/F90_HYPRE_parcsr_amg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/F90_HYPRE_parcsr_amg.c 2017-10-20 17:42:22.000000000 +0000 @@ -1743,6 +1743,54 @@ } /*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetChebyScale + *--------------------------------------------------------------------------*/ + +void +hypre_F90_IFACE(hypre_boomeramgsetchebyscale, HYPRE_BOOMERAMGSETCHEBYSCALE) + ( hypre_F90_Obj *solver, + hypre_F90_Int *cheby_scale, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGSetChebyScale( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassInt (cheby_scale) ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetChebyVariant + *--------------------------------------------------------------------------*/ + +void +hypre_F90_IFACE(hypre_boomeramgsetchebyvariant, HYPRE_BOOMERAMGSETCHEBYVARIANT) + ( hypre_F90_Obj *solver, + hypre_F90_Int *cheby_variant, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGSetChebyVariant( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassInt (cheby_variant) ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetChebyEigEst + *--------------------------------------------------------------------------*/ + +void +hypre_F90_IFACE(hypre_boomeramgsetchebyeigest, HYPRE_BOOMERAMGSETCHEBYEIGEST) + ( hypre_F90_Obj *solver, + hypre_F90_Int *cheby_eig_est, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGSetChebyEigEst( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassInt (cheby_eig_est) ) ); +} + +/*-------------------------------------------------------------------------- * HYPRE_BoomerAMGSetKeepTranspose *--------------------------------------------------------------------------*/ @@ -1775,7 +1823,7 @@ } /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetAdditive + * HYPRE_BoomerAMGSetAdditive, HYPRE_BoomerAMGGetAdditive *--------------------------------------------------------------------------*/ void @@ -1790,8 +1838,20 @@ hypre_F90_PassInt (add_lvl) ) ); } +void +hypre_F90_IFACE(hypre_boomeramggetadditive, HYPRE_BOOMERAMGGETADDITIVE) + ( hypre_F90_Obj *solver, + hypre_F90_Int *add_lvl, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGGetAdditive( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassIntRef (add_lvl) ) ); +} + /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetMultAdditive + * HYPRE_BoomerAMGSetMultAdditive, HYPRE BoomerAMGGetMultAdditive *--------------------------------------------------------------------------*/ void @@ -1806,8 +1866,20 @@ hypre_F90_PassInt (add_lvl) ) ); } +void +hypre_F90_IFACE(hypre_boomeramggetmultadd, HYPRE_BOOMERAMGGETMULTADD) + ( hypre_F90_Obj *solver, + hypre_F90_Int *add_lvl, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGGetMultAdditive( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassIntRef (add_lvl) ) ); +} + /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetSimple + * HYPRE_BoomerAMGSetSimple, HYPRE_BoomerAMGGetSimple *--------------------------------------------------------------------------*/ void @@ -1822,6 +1894,34 @@ hypre_F90_PassInt (add_lvl) ) ); } +void +hypre_F90_IFACE(hypre_boomeramggetsimple, HYPRE_BOOMERAMGGETSIMPLE) + ( hypre_F90_Obj *solver, + hypre_F90_Int *add_lvl, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGGetSimple( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassIntRef (add_lvl) ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetAddLastLvl + *--------------------------------------------------------------------------*/ + +void +hypre_F90_IFACE(hypre_boomeramgsetaddlastlvl, HYPRE_BOOMERAMGSETADDLASTLVL) + ( hypre_F90_Obj *solver, + hypre_F90_Int *add_last_lvl, + hypre_F90_Int *ierr ) +{ + *ierr = (hypre_F90_Int) + ( HYPRE_BoomerAMGSetAddLastLvl( + hypre_F90_PassObj (HYPRE_Solver, solver), + hypre_F90_PassInt (add_last_lvl) ) ); +} + /*-------------------------------------------------------------------------- * HYPRE_BoomerAMGSetMultAddTruncFactor *--------------------------------------------------------------------------*/ diff -Nru hypre-2.11.2/src/parcsr_ls/gen_redcs_mat.c hypre-2.13.0/src/parcsr_ls/gen_redcs_mat.c --- hypre-2.11.2/src/parcsr_ls/gen_redcs_mat.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/gen_redcs_mat.c 2017-10-20 17:42:22.000000000 +0000 @@ -495,7 +495,7 @@ *new_comm_ptr = new_comm; return 0; } - ranks = hypre_CTAlloc(HYPRE_Int, new_num_procs+2); + ranks = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2); if (new_num_procs == 1) { if (participate) my_info = my_id; @@ -503,8 +503,9 @@ } else { - info = hypre_CTAlloc(HYPRE_Int, new_num_procs+2); - list_len = hypre_CTAlloc(HYPRE_Int, 1); + info = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2); + list_len = hypre_HostCTAlloc(HYPRE_Int, 1); + if (participate) { @@ -522,8 +523,9 @@ hypre_MPI_Allreduce(info, ranks, list_len[0], HYPRE_MPI_INT, hypre_MPI_MERGE, comm); hypre_MPI_Op_free (&hypre_MPI_MERGE); - hypre_TFree(list_len); - hypre_TFree(info); + + hypre_HostTFree(list_len); + hypre_HostTFree(info); } hypre_MPI_Comm_size(comm,&num_procs); hypre_MPI_Comm_group(comm, &orig_group); @@ -532,7 +534,7 @@ hypre_MPI_Group_free(&new_group); hypre_MPI_Group_free(&orig_group); - hypre_TFree(ranks); + hypre_HostTFree(ranks); *new_comm_ptr = new_comm; diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_amg.c hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_amg.c --- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_amg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_amg.c 2017-10-20 17:42:22.000000000 +0000 @@ -1466,7 +1466,7 @@ return( hypre_BoomerAMGSetChebyOrder( (void *) solver, order ) ); } /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetChebyEigRatio + * HYPRE_BoomerAMGSetChebyFraction *--------------------------------------------------------------------------*/ HYPRE_Int @@ -1475,10 +1475,43 @@ { return( hypre_BoomerAMGSetChebyFraction( (void *) solver, ratio ) ); } + /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetInterpVectors + * HYPRE_BoomerAMGSetChebyScale + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGSetChebyScale( HYPRE_Solver solver, + HYPRE_Int scale ) +{ + return( hypre_BoomerAMGSetChebyScale( (void *) solver, scale ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetChebyVariant *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGSetChebyVariant( HYPRE_Solver solver, + HYPRE_Int variant ) +{ + return( hypre_BoomerAMGSetChebyVariant( (void *) solver, variant ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetChebyEigEst + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGSetChebyEigEst( HYPRE_Solver solver, + HYPRE_Int eig_est ) +{ + return( hypre_BoomerAMGSetChebyEigEst( (void *) solver, eig_est ) ); +} +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetInterpVectors + *--------------------------------------------------------------------------*/ HYPRE_Int HYPRE_BoomerAMGSetInterpVectors (HYPRE_Solver solver, HYPRE_Int num_vectors, HYPRE_ParVector *vectors) @@ -1604,6 +1637,17 @@ } /*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetAddLastLvl + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGSetAddLastLvl( HYPRE_Solver solver, + HYPRE_Int add_last_lvl ) +{ + return( hypre_BoomerAMGSetAddLastLvl( (void *) solver, add_last_lvl ) ); +} + +/*-------------------------------------------------------------------------- * HYPRE_BoomerAMGSetNonGalerkinTol *--------------------------------------------------------------------------*/ @@ -1660,3 +1704,11 @@ return (hypre_BoomerAMGSetKeepTranspose ( (void *) solver, keepTranspose ) ); } +HYPRE_Int +HYPRE_BoomerAMGSetCpointsToKeep(HYPRE_Solver solver, + HYPRE_Int cpt_coarse_level, + HYPRE_Int num_cpt_coarse, + HYPRE_Int *cpt_coarse_index) +{ + return (hypre_BoomerAMGSetCpointsToKeep( (void *) solver, cpt_coarse_level, num_cpt_coarse, cpt_coarse_index)); +} diff -Nru hypre-2.11.2/src/parcsr_ls/_hypre_parcsr_ls.h hypre-2.13.0/src/parcsr_ls/_hypre_parcsr_ls.h --- hypre-2.11.2/src/parcsr_ls/_hypre_parcsr_ls.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/_hypre_parcsr_ls.h 2017-10-20 17:42:22.000000000 +0000 @@ -167,8 +167,13 @@ HYPRE_Real *max_eig_est; HYPRE_Real *min_eig_est; - HYPRE_Int cheby_order; + HYPRE_Int cheby_eig_est; + HYPRE_Int cheby_order; + HYPRE_Int cheby_variant; + HYPRE_Int cheby_scale; HYPRE_Real cheby_fraction; + HYPRE_Real **cheby_ds; + HYPRE_Real **cheby_coefs; /* data needed for non-Galerkin option */ HYPRE_Int nongalerk_num_tol; @@ -240,18 +245,23 @@ HYPRE_Int additive; HYPRE_Int mult_additive; HYPRE_Int simple; + HYPRE_Int add_last_lvl; HYPRE_Int add_P_max_elmts; HYPRE_Real add_trunc_factor; HYPRE_Int add_rlx_type; HYPRE_Real add_rlx_wt; hypre_ParCSRMatrix *Lambda; + hypre_ParCSRMatrix *Atilde; hypre_ParVector *Rtilde; hypre_ParVector *Xtilde; HYPRE_Real *D_inv; HYPRE_Int rap2; HYPRE_Int keepTranspose; - +/* information for preserving indexes as coarse grid points */ + HYPRE_Int C_point_keep_level; + HYPRE_Int num_C_point_marker; + HYPRE_Int **C_point_marker_array; } hypre_ParAMGData; /*-------------------------------------------------------------------------- @@ -363,7 +373,11 @@ #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est) #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order) #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction) - +#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est) +#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant) +#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale) +#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds) +#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs) /* block */ #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array) @@ -437,11 +451,13 @@ #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive) #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive) #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple) +#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl) #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts) #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor) #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type) #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt) #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda) +#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde) #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde) #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde) #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv) @@ -454,6 +470,11 @@ #define hypre_ParAMGDataRAP2(amg_data) ((amg_data)->rap2) #define hypre_ParAMGDataKeepTranspose(amg_data) ((amg_data)->keepTranspose) + +/*indeces for the dof which will keep coarsening to the coarse level */ +#define hypre_ParAMGDataCPointKeepMarkerArray(amg_data) ((amg_data)-> C_point_marker_array) +#define hypre_ParAMGDataCPointKeepLevel(amg_data) ((amg_data)-> C_point_keep_level) +#define hypre_ParAMGDataNumCPointKeep(amg_data) ((amg_data)-> num_C_point_marker) #endif @@ -532,6 +553,7 @@ HYPRE_Int hypre_AMGHybridSetNumSweeps ( void *AMGhybrid_vdata , HYPRE_Int num_sweeps ); HYPRE_Int hypre_AMGHybridSetCycleNumSweeps ( void *AMGhybrid_vdata , HYPRE_Int num_sweeps , HYPRE_Int k ); HYPRE_Int hypre_AMGHybridSetRelaxType ( void *AMGhybrid_vdata , HYPRE_Int relax_type ); +HYPRE_Int hypre_AMGHybridSetSplittingStrategy( void *AMGhybrid_vdata , HYPRE_Int splitting_strategy ); HYPRE_Int hypre_AMGHybridSetCycleRelaxType ( void *AMGhybrid_vdata , HYPRE_Int relax_type , HYPRE_Int k ); HYPRE_Int hypre_AMGHybridSetRelaxOrder ( void *AMGhybrid_vdata , HYPRE_Int relax_order ); HYPRE_Int hypre_AMGHybridSetMaxCoarseSize ( void *AMGhybrid_vdata , HYPRE_Int max_coarse_size ); @@ -839,6 +861,9 @@ HYPRE_Int HYPRE_BoomerAMGSetCoordinates ( HYPRE_Solver solver , float *coordinates ); HYPRE_Int HYPRE_BoomerAMGSetChebyOrder ( HYPRE_Solver solver , HYPRE_Int order ); HYPRE_Int HYPRE_BoomerAMGSetChebyFraction ( HYPRE_Solver solver , HYPRE_Real ratio ); +HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst ( HYPRE_Solver solver , HYPRE_Int eig_est ); +HYPRE_Int HYPRE_BoomerAMGSetChebyVariant ( HYPRE_Solver solver , HYPRE_Int variant ); +HYPRE_Int HYPRE_BoomerAMGSetChebyScale ( HYPRE_Solver solver , HYPRE_Int scale ); HYPRE_Int HYPRE_BoomerAMGSetInterpVectors ( HYPRE_Solver solver , HYPRE_Int num_vectors , HYPRE_ParVector *vectors ); HYPRE_Int HYPRE_BoomerAMGSetInterpVecVariant ( HYPRE_Solver solver , HYPRE_Int num ); HYPRE_Int HYPRE_BoomerAMGSetInterpVecQMax ( HYPRE_Solver solver , HYPRE_Int q_max ); @@ -852,11 +877,13 @@ HYPRE_Int HYPRE_BoomerAMGGetMultAdditive ( HYPRE_Solver solver , HYPRE_Int *mult_additive ); HYPRE_Int HYPRE_BoomerAMGSetSimple ( HYPRE_Solver solver , HYPRE_Int simple ); HYPRE_Int HYPRE_BoomerAMGGetSimple ( HYPRE_Solver solver , HYPRE_Int *simple ); +HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl ( HYPRE_Solver solver , HYPRE_Int add_last_lvl ); HYPRE_Int HYPRE_BoomerAMGSetNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol ); HYPRE_Int HYPRE_BoomerAMGSetLevelNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol , HYPRE_Int level ); HYPRE_Int HYPRE_BoomerAMGSetNonGalerkTol ( HYPRE_Solver solver , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol ); HYPRE_Int HYPRE_BoomerAMGSetRAP2 ( HYPRE_Solver solver , HYPRE_Int rap2 ); HYPRE_Int HYPRE_BoomerAMGSetKeepTranspose ( HYPRE_Solver solver , HYPRE_Int keepTranspose ); +HYPRE_Int HYPRE_BoomerAMGSetCpointsToKeep( HYPRE_Solver solver, HYPRE_Int cpt_coarse_level, HYPRE_Int num_cpt_coarse,HYPRE_Int *cpt_coarse_index); /* HYPRE_parcsr_bicgstab.c */ HYPRE_Int HYPRE_ParCSRBiCGSTABCreate ( MPI_Comm comm , HYPRE_Solver *solver ); @@ -1261,6 +1288,9 @@ HYPRE_Int hypre_BoomerAMGSetEuBJ ( void *data , HYPRE_Int eu_bj ); HYPRE_Int hypre_BoomerAMGSetChebyOrder ( void *data , HYPRE_Int order ); HYPRE_Int hypre_BoomerAMGSetChebyFraction ( void *data , HYPRE_Real ratio ); +HYPRE_Int hypre_BoomerAMGSetChebyEigEst ( void *data , HYPRE_Int eig_est ); +HYPRE_Int hypre_BoomerAMGSetChebyVariant ( void *data , HYPRE_Int variant ); +HYPRE_Int hypre_BoomerAMGSetChebyScale ( void *data , HYPRE_Int scale ); HYPRE_Int hypre_BoomerAMGSetInterpVectors ( void *solver , HYPRE_Int num_vectors , hypre_ParVector **interp_vectors ); HYPRE_Int hypre_BoomerAMGSetInterpVecVariant ( void *solver , HYPRE_Int var ); HYPRE_Int hypre_BoomerAMGSetInterpVecQMax ( void *data , HYPRE_Int q_max ); @@ -1274,11 +1304,13 @@ HYPRE_Int hypre_BoomerAMGGetMultAdditive ( void *data , HYPRE_Int *mult_additive ); HYPRE_Int hypre_BoomerAMGSetSimple ( void *data , HYPRE_Int simple ); HYPRE_Int hypre_BoomerAMGGetSimple ( void *data , HYPRE_Int *simple ); +HYPRE_Int hypre_BoomerAMGSetAddLastLvl ( void *data , HYPRE_Int add_last_lvl ); HYPRE_Int hypre_BoomerAMGSetNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol ); HYPRE_Int hypre_BoomerAMGSetLevelNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol , HYPRE_Int level ); HYPRE_Int hypre_BoomerAMGSetNonGalerkTol ( void *data , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol ); HYPRE_Int hypre_BoomerAMGSetRAP2 ( void *data , HYPRE_Int rap2 ); HYPRE_Int hypre_BoomerAMGSetKeepTranspose ( void *data , HYPRE_Int keepTranspose ); +HYPRE_Int hypre_BoomerAMGSetCpointsToKeep(void *data, HYPRE_Int cpt_coarse_level, HYPRE_Int num_cpt_coarse, HYPRE_Int *cpt_coarse_index); /* par_amg_setup.c */ HYPRE_Int hypre_BoomerAMGSetup ( void *amg_vdata , hypre_ParCSRMatrix *A , hypre_ParVector *f , hypre_ParVector *u ); @@ -1303,6 +1335,10 @@ HYPRE_Int hypre_BoomerAMGCGRelaxWt ( void *amg_vdata , HYPRE_Int level , HYPRE_Int num_cg_sweeps , HYPRE_Real *rlx_wt_ptr ); HYPRE_Int hypre_Bisection ( HYPRE_Int n , HYPRE_Real *diag , HYPRE_Real *offd , HYPRE_Real y , HYPRE_Real z , HYPRE_Real tol , HYPRE_Int k , HYPRE_Real *ev_ptr ); +/* par_cheby.c */ +HYPRE_Int hypre_ParCSRRelax_Cheby_Setup ( hypre_ParCSRMatrix *A , HYPRE_Real max_eig , HYPRE_Real min_eig , HYPRE_Real fraction , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , HYPRE_Real **coefs_ptr , HYPRE_Real **ds_ptr ); +HYPRE_Int hypre_ParCSRRelax_Cheby_Solve ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Real *ds_data , HYPRE_Real *coefs , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , hypre_ParVector *u , hypre_ParVector *v , hypre_ParVector *r ); + /* par_coarsen.c */ HYPRE_Int hypre_BoomerAMGCoarsen ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int CF_init , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr ); HYPRE_Int hypre_BoomerAMGCoarsenRuge ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int measure_type , HYPRE_Int coarsen_type , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr ); @@ -1408,6 +1444,7 @@ HYPRE_Int hypre_IntersectTwoArrays ( HYPRE_Int *x , HYPRE_Real *x_data , HYPRE_Int x_length , HYPRE_Int *y , HYPRE_Int y_length , HYPRE_Int *z , HYPRE_Real *output_x_data , HYPRE_Int *intersect_length ); HYPRE_Int hypre_SortedCopyParCSRData ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B ); HYPRE_Int hypre_BoomerAMG_MyCreateS ( hypre_ParCSRMatrix *A , HYPRE_Real strength_threshold , HYPRE_Real max_row_sum , HYPRE_Int num_functions , HYPRE_Int *dof_func , hypre_ParCSRMatrix **S_ptr ); +HYPRE_Int hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, HYPRE_Real strength_threshold, HYPRE_Real max_row_sum, HYPRE_Int *CF_marker, HYPRE_Int SMRK, hypre_ParCSRMatrix **S_ptr); HYPRE_Int hypre_NonGalerkinIJBufferInit ( HYPRE_Int *ijbuf_cnt , HYPRE_Int *ijbuf_rowcounter , HYPRE_Int *ijbuf_numcols ); HYPRE_Int hypre_NonGalerkinIJBufferNewRow ( HYPRE_Int *ijbuf_rownums , HYPRE_Int *ijbuf_numcols , HYPRE_Int *ijbuf_rowcounter , HYPRE_Int new_row ); HYPRE_Int hypre_NonGalerkinIJBufferCompressRow ( HYPRE_Int *ijbuf_cnt , HYPRE_Int ijbuf_rowcounter , HYPRE_Real *ijbuf_data , HYPRE_Int *ijbuf_cols , HYPRE_Int *ijbuf_rownums , HYPRE_Int *ijbuf_numcols ); @@ -1430,7 +1467,7 @@ HYPRE_Int hypre_BoomerAMGRelax ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_points , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp ); HYPRE_Int hypre_GaussElimSetup ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type ); HYPRE_Int hypre_GaussElimSolve ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type ); -HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n ); +HYPRE_CUDA_GLOBAL HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n ); /* par_relax_interface.c */ HYPRE_Int hypre_BoomerAMGRelaxIF ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_order , HYPRE_Int cycle_type , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp ); @@ -1509,6 +1546,19 @@ HYPRE_Real rfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); HYPRE_Real bndfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +/* par_vardifconv_rs.c */ +HYPRE_ParCSRMatrix GenerateRSVarDifConv ( MPI_Comm comm , HYPRE_Int nx , HYPRE_Int ny , HYPRE_Int nz , HYPRE_Int P , HYPRE_Int Q , HYPRE_Int R , HYPRE_Int p , HYPRE_Int q , HYPRE_Int r , HYPRE_Real eps , HYPRE_ParVector *rhs_ptr, HYPRE_Int type ); +HYPRE_Real afun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real bfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real cfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real dfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real efun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real ffun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real gfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real rfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); +HYPRE_Real bndfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz ); + + /* pcg_par.c */ char *hypre_ParKrylovCAlloc ( HYPRE_Int count , HYPRE_Int elt_size ); HYPRE_Int hypre_ParKrylovFree ( char *ptr ); @@ -1552,6 +1602,59 @@ HYPRE_Int hypre_ParGenerateScale ( hypre_ParCSRMatrix *A , hypre_CSRMatrix *domain_structure , HYPRE_Real relaxation_weight , HYPRE_Real **scale_pointer ); HYPRE_Int hypre_ParGenerateHybridScale ( hypre_ParCSRMatrix *A , hypre_CSRMatrix *domain_structure , hypre_CSRMatrix **A_boundary_pointer , HYPRE_Real **scale_pointer ); +/* par_mgr.c */ +void *hypre_MGRCreate ( void ); +HYPRE_Int hypre_MGRDestroy ( void *mgr_vdata ); +HYPRE_Int hypre_MGRCycle( void *mgr_vdata, hypre_ParVector **F_array, hypre_ParVector **U_array ); +void *hypre_MGRCreateFrelaxVcycleData(); +HYPRE_Int hypre_MGRDestroyFrelaxVcycleData( void *mgr_vdata ); +HYPRE_Int hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u, HYPRE_Int level); +HYPRE_Int hypre_MGRFrelaxVcycle ( void *mgr_vdata ); +HYPRE_Int hypre_MGRSetCpointsByBlock( void *mgr_vdata, HYPRE_Int block_size, HYPRE_Int max_num_levels, HYPRE_Int *block_num_coarse_points, HYPRE_Int **block_coarse_indexes); +HYPRE_Int hypre_MGRCoarsen(hypre_ParCSRMatrix *S, hypre_ParCSRMatrix *A,HYPRE_Int final_coarse_size,HYPRE_Int *final_coarse_indexes,HYPRE_Int debug_flag,HYPRE_Int **CF_marker,HYPRE_Int last_level); +HYPRE_Int hypre_MGRSetReservedCoarseNodes(void *mgr_vdata, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_nodes); +HYPRE_Int hypre_MGRSetMaxGlobalsmoothIters( void *mgr_vdata, HYPRE_Int max_iter ); +HYPRE_Int hypre_MGRSetGlobalsmoothType( void *mgr_vdata, HYPRE_Int iter_type ); +HYPRE_Int hypre_MGRSetNonCpointsToFpoints( void *mgr_vdata, HYPRE_Int nonCptToFptFlag); + +//HYPRE_Int hypre_MGRInitCFMarker(HYPRE_Int num_variables, HYPRE_Int *CF_marker, HYPRE_Int initial_coarse_size,HYPRE_Int *initial_coarse_indexes); +//HYPRE_Int hypre_MGRUpdateCoarseIndexes(HYPRE_Int num_variables, HYPRE_Int *CF_marker, HYPRE_Int initial_coarse_size,HYPRE_Int *initial_coarse_indexes); +HYPRE_Int hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, hypre_ParCSRMatrix *S, HYPRE_Int *num_cpts_global, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int debug_flag, HYPRE_Real trunc_factor, HYPRE_Int max_elmts, HYPRE_Int *col_offd_S_to_A, hypre_ParCSRMatrix **P, HYPRE_Int last_level, HYPRE_Int level, HYPRE_Int numsweeps); +//HYPRE_Int hypre_MGRBuildRestrictionToper(hypre_ParCSRMatrix *AT, HYPRE_Int *CF_marker, hypre_ParCSRMatrix *ST, HYPRE_Int *num_cpts_global,HYPRE_Int num_functions,HYPRE_Int *dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor, HYPRE_Int max_elmts, HYPRE_Int *col_offd_ST_to_AT,hypre_ParCSRMatrix **RT,HYPRE_Int last_level,HYPRE_Int level, HYPRE_Int numsweeps); +//HYPRE_Int hypre_BoomerAMGBuildInjectionInterp( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, HYPRE_Int *num_cpts_global, HYPRE_Int num_functions, HYPRE_Int debug_flag,HYPRE_Int init_data,hypre_ParCSRMatrix **P_ptr); +HYPRE_Int hypre_MGRSetCoarseSolver( void *mgr_vdata, HYPRE_Int (*coarse_grid_solver_solve)(void*,void*,void*,void*), HYPRE_Int (*coarse_grid_solver_setup)(void*,void*,void*,void*), void *coarse_grid_solver ); +HYPRE_Int hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ); +HYPRE_Int hypre_MGRSolve( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ); +HYPRE_Int hypre_block_jacobi_scaling(hypre_ParCSRMatrix *A,hypre_ParCSRMatrix **B_ptr,void *mgr_vdata,HYPRE_Int debug_flag); +HYPRE_Int hypre_block_jacobi (hypre_ParCSRMatrix *A,hypre_ParVector *f,hypre_ParVector *u,HYPRE_Real blk_size,HYPRE_Int n_block,HYPRE_Int left_size,HYPRE_Real *diaginv,hypre_ParVector *Vtemp); +HYPRE_Int hypre_blockRelax_setup(hypre_ParCSRMatrix *A,HYPRE_Int blk_size, HYPRE_Int reserved_coarse_size, HYPRE_Real **diaginvptr); +HYPRE_Int hypre_blockRelax(hypre_ParCSRMatrix *A,hypre_ParVector *f,hypre_ParVector *u,HYPRE_Int blk_size,HYPRE_Int reserved_coarse_size,hypre_ParVector *Vtemp,hypre_ParVector *Ztemp); + +HYPRE_Int hypre_MGRBuildAff( MPI_Comm comm, HYPRE_Int local_num_variables, HYPRE_Int num_functions, +HYPRE_Int *dof_func, HYPRE_Int *CF_marker, HYPRE_Int **coarse_dof_func_ptr, HYPRE_Int **coarse_pnts_global_ptr, +hypre_ParCSRMatrix *A, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_f_ptr, hypre_ParCSRMatrix **A_ff_ptr ); + +HYPRE_Int hypre_MGRSetAffSolverType( void *systg_vdata, HYPRE_Int *aff_solver_type ); +HYPRE_Int hypre_MGRSetCoarseSolverType( void *systg_vdata, HYPRE_Int coarse_solver_type ); +HYPRE_Int hypre_MGRSetCoarseSolverIter( void *systg_vdata, HYPRE_Int coarse_solver_iter ); +HYPRE_Int hypre_MGRSetFineSolverIter( void *systg_vdata, HYPRE_Int fine_solver_iter ); +HYPRE_Int hypre_MGRSetFineSolverMaxLevels( void *systg_vdata, HYPRE_Int fine_solver_max_levels ); +HYPRE_Int hypre_MGRSetMaxCoarseLevels( void *mgr_vdata, HYPRE_Int maxlev ); +HYPRE_Int hypre_MGRSetBlockSize( void *mgr_vdata, HYPRE_Int bsize ); +HYPRE_Int hypre_MGRSetRelaxType( void *mgr_vdata, HYPRE_Int relax_type ); +HYPRE_Int hypre_MGRSetFRelaxMethod( void *mgr_vdata, HYPRE_Int relax_method); +HYPRE_Int hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int interpType); +HYPRE_Int hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType); +HYPRE_Int hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps ); +HYPRE_Int hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps ); +HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); +HYPRE_Int hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging ); +HYPRE_Int hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter ); +HYPRE_Int hypre_MGRSetTol( void *mgr_vdata, HYPRE_Real tol ); +// Accessor functions +HYPRE_Int hypre_MGRGetNumIterations( void *mgr_vdata, HYPRE_Int *num_iterations ); +HYPRE_Int hypre_MGRGetFinalRelativeResidualNorm( void *mgr_vdata, HYPRE_Real *res_norm ); + #ifdef __cplusplus } #endif diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_ls.h hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_ls.h --- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_ls.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_ls.h 2017-10-20 17:42:22.000000000 +0000 @@ -274,7 +274,7 @@ * \hline * \end{tabular} * - * The default is 6. + * The default is 10. **/ HYPRE_Int HYPRE_BoomerAMGSetCoarsenType(HYPRE_Solver solver, HYPRE_Int coarsen_type); @@ -406,21 +406,21 @@ * \hline * \end{tabular} * - * The default is 0. + * The default is ext+i interpolation (interp_type 6) trunctated to at most 4 \\ + * elements per row. (see HYPRE_BoomerAMGSetPMaxElmts). **/ HYPRE_Int HYPRE_BoomerAMGSetInterpType(HYPRE_Solver solver, HYPRE_Int interp_type); /** - * (Optional) Defines a truncation factor for the interpolation. - * The default is 0. + * (Optional) Defines a truncation factor for the interpolation. The default is 0. **/ HYPRE_Int HYPRE_BoomerAMGSetTruncFactor(HYPRE_Solver solver, HYPRE_Real trunc_factor); /** * (Optional) Defines the maximal number of elements per row for the interpolation. - * The default is 0. + * The default is 4. To turn off truncation, it needs to be set to 0. **/ HYPRE_Int HYPRE_BoomerAMGSetPMaxElmts(HYPRE_Solver solver, HYPRE_Int P_max_elmts); @@ -582,6 +582,16 @@ HYPRE_Int addlvl); /** + * (Optional) Defines last level where additive, mult-additive + * or simple cycle is used. + * The multiplicative approach is used on levels > add_last_lvl. + * + * Can only be used when AMG is used as a preconditioner !!! + **/ +HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl(HYPRE_Solver solver, + HYPRE_Int add_last_lvl); + +/** * (Optional) Defines the truncation factor for the * smoothed interpolation used for mult-additive or simple method. * The default is 0. @@ -676,7 +686,8 @@ * (Optional) Defines the smoother to be used. It uses the given * smoother on the fine grid, the up and * the down cycle and sets the solver on the coarsest level to Gaussian - * elimination (9). The default is Gauss-Seidel (3). + * elimination (9). The default is $\ell_1$-Gauss-Seidel, forward solve (13) + * on the down cycle and backward solve (14) on the up cycle. * * There are the following options for relax\_type: * @@ -690,6 +701,8 @@ * 6 & hybrid symmetric Gauss-Seidel or SSOR \\ * 8 & $\ell_1$-scaled hybrid symmetric Gauss-Seidel\\ * 9 & Gaussian elimination (only on coarsest level) \\ + * 13 & $\ell_1$ Gauss-Seidel, forward solve \\ + * 14 & $\ell_1$ Gauss-Seidel, backward solve \\ * 15 & CG (warning - not a fixed smoother - may require FGMRES)\\ * 16 & Chebyshev\\ * 17 & FCF-Jacobi\\ @@ -733,7 +746,7 @@ * \hline * \end{tabular} * - * The default is 1 (CF-relaxation). + * The default is 0. **/ HYPRE_Int HYPRE_BoomerAMGSetRelaxOrder(HYPRE_Solver solver, HYPRE_Int relax_order); @@ -838,6 +851,31 @@ HYPRE_Int HYPRE_BoomerAMGSetChebyFraction (HYPRE_Solver solver, HYPRE_Real ratio); +/* + * (Optional) Defines whether matrix should be scaled. + * The default is 1 (i.e., scaled). + **/ +HYPRE_Int HYPRE_BoomerAMGSetChebyScale (HYPRE_Solver solver, + HYPRE_Int scale); +/* + * (Optional) Defines which polynomial variant should be used. + * The default is 0 (i.e., scaled). + **/ +HYPRE_Int HYPRE_BoomerAMGSetChebyVariant (HYPRE_Solver solver, + HYPRE_Int variant); + +/* + * (Optional) Defines how to estimate eigenvalues. + * The default is 10 (i.e., 10 CG iterations are used to find extreme + * eigenvalues.) If eig_est=0, the largest eigenvalue is estimated + * using Gershgorin, the smallest is set to 0. + * If eig_est is a positive number n, n iterations of CG are used to + * determine the smallest and largest eigenvalue. + **/ +HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst (HYPRE_Solver solver, + HYPRE_Int eig_est); + + /** * (Optional) Enables the use of more complex smoothers. * The following options exist for smooth\_type: @@ -1107,6 +1145,19 @@ HYPRE_Int HYPRE_BoomerAMGSetCoordinates (HYPRE_Solver solver, float *coordinates); +/** + * (Optional) Fix C points to be kept till a specified coarse level. + * + * @param solver [IN] solver or preconditioner + * @param cpt_coarse_level [IN] coarse level up to which to keep C points + * @param num_cpt_coarse [IN] number of C points to be kept + * @param cpt_coarse_index [IN] indexes of C points to be kept + **/ +HYPRE_Int HYPRE_BoomerAMGSetCpointsToKeep(HYPRE_Solver solver, + HYPRE_Int cpt_coarse_level, + HYPRE_Int num_cpt_coarse, + HYPRE_Int *cpt_coarse_index); + /*@}*/ /*-------------------------------------------------------------------------- @@ -2578,16 +2629,18 @@ HYPRE_Int logging); /** - * Set print level (default: 0, no printing). + * Set print level (default: 0, no printing) + * 2 will print residual norms per iteration + * 10 will print AMG setup information if AMG is used + * 12 both Setup information and iterations. **/ HYPRE_Int HYPRE_ParCSRHybridSetPrintLevel(HYPRE_Solver solver, HYPRE_Int print_level); /** * (Optional) Sets AMG strength threshold. The default is 0.25. - * For 2d Laplace operators, 0.25 is a good value, for 3d Laplace - * operators, 0.5 or 0.6 is a better value. For elasticity problems, - * a large strength threshold, such as 0.9, is often better. + * For elasticity problems, a larger strength threshold, such as 0.7 or 0.8, + * is often better. **/ HYPRE_Int HYPRE_ParCSRHybridSetStrongThreshold(HYPRE_Solver solver, @@ -2655,7 +2708,7 @@ * \hline * \end{tabular} * - * The default is 6. + * The default is 10. **/ HYPRE_Int HYPRE_ParCSRHybridSetCoarsenType(HYPRE_Solver solver, @@ -2663,7 +2716,7 @@ /* * (Optional) Specifies which interpolation operator is used - * The default is modified ''classical" interpolation. + * The default is ext+i interpolation truncated to at most 4 elements per row. **/ HYPRE_Int HYPRE_ParCSRHybridSetInterpType(HYPRE_Solver solver, @@ -2721,7 +2774,8 @@ * (Optional) Defines the smoother to be used. It uses the given * smoother on the fine grid, the up and * the down cycle and sets the solver on the coarsest level to Gaussian - * elimination (9). The default is Gauss-Seidel (3). + * elimination (9). The default is l1-Gauss-Seidel, forward solve on the down + * cycle (13) and backward solve on the up cycle (14). * * There are the following options for relax\_type: * @@ -2731,8 +2785,11 @@ * 2 & Gauss-Seidel, interior points in parallel, boundary sequential (slow!) \\ * 3 & hybrid Gauss-Seidel or SOR, forward solve \\ * 4 & hybrid Gauss-Seidel or SOR, backward solve \\ - * 5 & hybrid chaotic Gauss-Seidel (works only with OpenMP) \\ * 6 & hybrid symmetric Gauss-Seidel or SSOR \\ + * 8 & hybrid symmetric l1-Gauss-Seidel or SSOR \\ + * 13 & l1-Gauss-Seidel, forward solve \\ + * 14 & l1-Gauss-Seidel, backward solve \\ + * 18 & l1-Jacobi \\ * 9 & Gaussian elimination (only on coarsest level) \\ * \hline * \end{tabular} @@ -2775,7 +2832,7 @@ * \hline * \end{tabular} * - * The default is 1 (CF-relaxation). + * The default is 0 (CF-relaxation). **/ HYPRE_Int HYPRE_ParCSRHybridSetRelaxOrder(HYPRE_Solver solver, @@ -3056,6 +3113,353 @@ HYPRE_Real *norm); /*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +/** + * @name ParCSR MGR Solver + * + * Parallel multigrid reduction solver and preconditioner. + * This solver or preconditioner is designed with systems of + * PDEs in mind. However, it can also be used for scalar linear + * systems, particularly for problems where the user can exploit + * information from the physics of the problem. In this way, the + * MGR solver could potentially be used as a foundation + * for a physics-based preconditioner. + **/ +/*@{*/ + +/** + * Create a solver object + **/ +HYPRE_Int HYPRE_MGRCreate( HYPRE_Solver *solver ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRDestroy + *--------------------------------------------------------------------------*/ +/** + * Destroy a solver object + **/ +HYPRE_Int HYPRE_MGRDestroy( HYPRE_Solver solver ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetup + *--------------------------------------------------------------------------*/ +/** + * Setup the MGR solver or preconditioner. + * If used as a preconditioner, this function should be passed + * to the iterative solver {\tt SetPrecond} function. + * + * @param solver [IN] object to be set up. + * @param A [IN] ParCSR matrix used to construct the solver/preconditioner. + * @param b right-hand-side of the linear system to be solved (Ignored by this function). + * @param x approximate solution of the linear system to be solved (Ignored by this function). + **/ +HYPRE_Int HYPRE_MGRSetup( HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x ); +/*-------------------------------------------------------------------------- + * HYPRE_MGRSolve + *--------------------------------------------------------------------------*/ + /** + * Solve the system or apply MGR as a preconditioner. + * If used as a preconditioner, this function should be passed + * to the iterative solver {\tt SetPrecond} function. + * + * @param solver [IN] solver or preconditioner object to be applied. + * @param A [IN] ParCSR matrix, matrix of the linear system to be solved + * @param b [IN] right hand side of the linear system to be solved + * @param x [OUT] approximated solution of the linear system to be solved + **/ +HYPRE_Int HYPRE_MGRSolve( HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x ); + +/*-------------------------------------------------------------------------- + * HYPRE_Int HYPRE_MGRSetCpointsByBlock + *--------------------------------------------------------------------------*/ +/** + * Set the block data and prescribe the coarse indexes per block + * for each reduction level. + * + * @param solver [IN] solver or preconditioner object + * @param block_size [IN] system block size + * @param max_num_levels [IN] maximum number of reduction levels + * @param num_block_coarse_points [IN] number of coarse points per block per level + * @param block_coarse_indexes [IN] index for each block coarse point per level + **/ +HYPRE_Int HYPRE_MGRSetCpointsByBlock( HYPRE_Solver solver, + HYPRE_Int block_size, + HYPRE_Int max_num_levels, + HYPRE_Int *num_block_coarse_points, + HYPRE_Int **block_coarse_indexes); + +/** + * (Optional) Set non C-points to F-points. + * This routine determines how the coarse points are selected for the next level + * reduction. Options for {\tt nonCptToFptFlag} are: + * + * \begin{tabular}{|c|l|} \hline + * 0 & Allow points not prescribed as C points to be potentially set as C points \\ + * & using classical AMG coarsening strategies (currently uses CLJP-coarsening). \\ + * 1 & Fix points not prescribed as C points to be F points for the next reduction \\ + * \hline + * \end{tabular} + * + **/ +HYPRE_Int +HYPRE_MGRSetNonCpointsToFpoints( HYPRE_Solver solver, HYPRE_Int nonCptToFptFlag); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxCoarseLevels + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set maximum number of coarsening (or reduction) levels. + * The default is 10. + **/ +HYPRE_Int +HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev ); +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetBlockSize + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the system block size. + * This should match the block size set in the MGRSetCpointsByBlock function. + * The default is 1. + **/ +HYPRE_Int +HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize ); +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetReservedCoarseNodes + *--------------------------------------------------------------------------*/ +/** + * (Optional) Defines indexes of coarse nodes to be kept to the coarsest level. + * These indexes are passed down through the MGR hierarchy to the coarsest grid + * of the coarse grid (BoomerAMG) solver. + * + * @param solver [IN] solver or preconditioner object + * @param reserved_coarse_size [IN] number of reserved coarse points + * @param reserved_coarse_nodes [IN] (global) indexes of reserved coarse points + **/ +HYPRE_Int +HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_nodes ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRelaxType + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the relaxation type for F-relaxation. + * Currently supports the following flavors of relaxation types + * as described in the {\tt BoomerAMGSetRelaxType}: + * relax\_types 0 - 8, 13, 14, 18, 19, 98. + **/ +HYPRE_Int +HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRelaxMethod + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the strategy for F-relaxation. + * Options for {\tt relax\_method} are: + * + * \begin{tabular}{|c|l|} \hline + * 0 & Single-level relaxation sweeps for F-relaxation as prescribed by {\tt MGRSetRelaxType} \\ + * 1 & Multi-level relaxation strategy for F-relaxation (V(1,0) cycle currently supported). \\ + * \hline + * \end{tabular} + **/ +HYPRE_Int +HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRestrictType + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the strategy for computing the MGR restriction operator. + * + * Options for {\tt restrict\_type} are: + * + * \begin{tabular}{|c|l|} \hline + * 0 & injection $[0 I]$ \\ + * 1 & unscaled (not recommended) \\ + * 2 & diagonal scaling (Jacobi) \\ + * else & use classical modified interpolation \\ + * \hline + * \end{tabular} + * + * These options are currently active for the last stage reduction. Intermediate + * reduction levels use injection. The default is injection. + **/ +HYPRE_Int +HYPRE_MGRSetRestrictType( HYPRE_Solver solver, HYPRE_Int restrict_type); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetInterpType + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the strategy for computing the MGR restriction operator. + * Options for {\tt interp\_type} are: + * + * \begin{tabular}{|c|l|} \hline + * 0 & injection $[0 I]^{T}$ \\ + * 1 & unscaled (not recommended) \\ + * 2 & diagonal scaling (Jacobi) \\ + * else & use default (classical modified interpolation) \\ + * \hline + * \end{tabular} + * + * These options are currently active for the last stage reduction. Intermediate + * reduction levels use diagonal scaling. + **/ +HYPRE_Int +HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interp_type ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetNumRelaxSweeps + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set number of relaxation sweeps. + * This option is for the `single level' F-relaxation (relax\_method = 0). + **/ +HYPRE_Int +HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetNumInterpSweeps + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set number of interpolation sweeps. + * This option is for interp\_type > 2. + **/ +HYPRE_Int +HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetCoarseSolver + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the coarse grid solver. + * Currently uses BoomerAMG. + * The default, if not set, is BoomerAMG with default options. + * + * @param solver [IN] solver or preconditioner object + * @param coarse_grid_solver_solve [IN] solve routine for BoomerAMG + * @param coarse_grid_solver_setup [IN] setup routine for BoomerAMG + * @param coarse_grid_solver [IN] BoomerAMG solver + **/ +HYPRE_Int HYPRE_MGRSetCoarseSolver(HYPRE_Solver solver, + HYPRE_PtrToParSolverFcn coarse_grid_solver_solve, + HYPRE_PtrToParSolverFcn coarse_grid_solver_setup, + HYPRE_Solver coarse_grid_solver ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetPrintLevel + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the print level to print setup and solve information. + * + * \begin{tabular}{|c|l|} \hline + * 0 & no printout (default) \\ + * 1 & print setup information \\ + * 2 & print solve information \\ + * 3 & print both setup and solve information \\ + * \hline + * \end{tabular} + **/ +HYPRE_Int +HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetLogging + *--------------------------------------------------------------------------*/ +/** + * (Optional) Requests logging of solver diagnostics. + * Requests additional computations for diagnostic and similar + * data to be logged by the user. Default to 0 for do nothing. The latest + * residual will be available if logging > 1. + **/ +HYPRE_Int +HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxIter + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set maximum number of iterations if used as a solver. + * Set this to 1 if MGR is used as a preconditioner. The default is 20. + **/ +HYPRE_Int +HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetTol + *--------------------------------------------------------------------------*/ +/** + * (Optional) Set the convergence tolerance for the MGR solver. + * Use tol = 0.0 if MGR is used as a preconditioner. The default is 1.e-7. + **/ +HYPRE_Int +HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxGlobalsmoothIters + *--------------------------------------------------------------------------*/ +/** + * (Optional) Determines how many sweeps of global smoothing to do. + * Default is 0 (no global smoothing). + **/ +HYPRE_Int +HYPRE_MGRSetMaxGlobalsmoothIters( HYPRE_Solver solver, HYPRE_Int smooth_iter ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetGlobalsmoothType + *--------------------------------------------------------------------------*/ +/** + * (Optional) Determines type of global smoother. + * Options for {\tt smooth\_type} are: + * + * \begin{tabular}{|c|l|} \hline + * 0 & block Jacobi (default) \\ + * 1 & Jacobi \\ + * 2 & Gauss-Seidel, sequential (very slow!) \\ + * 3 & Gauss-Seidel, interior points in parallel, boundary sequential (slow!) \\ + * 4 & hybrid Gauss-Seidel or SOR, forward solve \\ + * 5 & hybrid Gauss-Seidel or SOR, backward solve \\ + * 6 & hybrid chaotic Gauss-Seidel (works only with OpenMP) \\ + * 7 & hybrid symmetric Gauss-Seidel or SSOR \\ + * 8 & Euclid (ILU) \\ + * \hline + * \end{tabular} + **/ +HYPRE_Int +HYPRE_MGRSetGlobalsmoothType( HYPRE_Solver solver, HYPRE_Int smooth_type ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRGetNumIterations + *--------------------------------------------------------------------------*/ +/** + * (Optional) Return the number of MGR iterations. + **/ +HYPRE_Int +HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations ); + +/*-------------------------------------------------------------------------- + * HYPRE_MGRGetResidualNorm + *--------------------------------------------------------------------------*/ +/** + * (Optional) Return the norm of the final relative residual. + **/ +HYPRE_Int +HYPRE_MGRGetFinalRelativeResidualNorm( HYPRE_Solver solver, HYPRE_Real *res_norm ); + +/*@}*/ + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +/*-------------------------------------------------------------------------- * Miscellaneous: These probably do not belong in the interface. *--------------------------------------------------------------------------*/ @@ -3129,6 +3533,21 @@ HYPRE_Real eps, HYPRE_ParVector *rhs_ptr); +HYPRE_ParCSRMatrix +GenerateRSVarDifConv(MPI_Comm comm, + HYPRE_Int nx, + HYPRE_Int ny, + HYPRE_Int nz, + HYPRE_Int P, + HYPRE_Int Q, + HYPRE_Int R, + HYPRE_Int p, + HYPRE_Int q, + HYPRE_Int r, + HYPRE_Real eps, + HYPRE_ParVector *rhs_ptr, + HYPRE_Int type); + float* GenerateCoordinates(MPI_Comm comm, HYPRE_Int nx, diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_mgr.c hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_mgr.c --- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_mgr.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_mgr.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,249 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#include "_hypre_parcsr_ls.h" + +/*-------------------------------------------------------------------------- + * HYPRE_MGRCreate + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRCreate( HYPRE_Solver *solver ) +{ + if (!solver) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + *solver = ( (HYPRE_Solver) hypre_MGRCreate( ) ); + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRDestroy + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRDestroy( HYPRE_Solver solver ) +{ + return( hypre_MGRDestroy( (void *) solver ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetup + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetup( HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x ) +{ + return( hypre_MGRSetup( (void *) solver, + (hypre_ParCSRMatrix *) A, + (hypre_ParVector *) b, + (hypre_ParVector *) x ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSolve + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSolve( HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x ) +{ + return( hypre_MGRSolve( (void *) solver, + (hypre_ParCSRMatrix *) A, + (hypre_ParVector *) b, + (hypre_ParVector *) x ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetCpointsByBlock + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetCpointsByBlock( HYPRE_Solver solver, + HYPRE_Int block_size, + HYPRE_Int max_num_levels, + HYPRE_Int *block_num_coarse_points, + HYPRE_Int **block_coarse_indexes) +{ + return( hypre_MGRSetCpointsByBlock( (void *) solver, block_size, max_num_levels, block_num_coarse_points, block_coarse_indexes)); +} + +HYPRE_Int +HYPRE_MGRSetNonCpointsToFpoints( HYPRE_Solver solver, HYPRE_Int nonCptToFptFlag) +{ + return hypre_MGRSetNonCpointsToFpoints((void *) solver, nonCptToFptFlag); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetCoarseSolver + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetCoarseSolver(HYPRE_Solver solver, + HYPRE_PtrToParSolverFcn coarse_grid_solver_solve, + HYPRE_PtrToParSolverFcn coarse_grid_solver_setup, + HYPRE_Solver coarse_grid_solver ) +{ + return( hypre_MGRSetCoarseSolver( (void *) solver, + (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_solve, + (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_setup, + (void *) coarse_grid_solver ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxLevels + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev ) +{ + return hypre_MGRSetMaxCoarseLevels(solver, maxlev); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetBlockSize + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize ) +{ + return hypre_MGRSetBlockSize(solver, bsize ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetReservedCoarseNodes + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_indexes ) +{ + return hypre_MGRSetReservedCoarseNodes(solver, reserved_coarse_size, reserved_coarse_indexes ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRestrictType + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetRestrictType(HYPRE_Solver solver, HYPRE_Int restrict_type ) +{ + return hypre_MGRSetRestrictType(solver, restrict_type ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRelaxMethod + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method ) +{ + return hypre_MGRSetFRelaxMethod(solver, relax_method ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetRelaxType + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type ) +{ + return hypre_MGRSetRelaxType(solver, relax_type ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetNumRelaxSweeps + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) +{ + return hypre_MGRSetNumRelaxSweeps(solver, nsweeps); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetInterpType + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interpType ) +{ + return hypre_MGRSetInterpType(solver, interpType); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetNumInterpSweeps + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) +{ + return hypre_MGRSetNumInterpSweeps(solver, nsweeps); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetPrintLevel + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) +{ + return hypre_MGRSetPrintLevel( solver, print_level ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetLogging + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging ) +{ + return hypre_MGRSetLogging(solver, logging ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxIter + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter ) +{ + return hypre_MGRSetMaxIter( solver, max_iter ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetTol + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol ) +{ + return hypre_MGRSetTol( solver, tol ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetMaxGlobalsmoothIters + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRSetMaxGlobalsmoothIters( HYPRE_Solver solver, HYPRE_Int max_iter ) +{ + return hypre_MGRSetMaxGlobalsmoothIters(solver, max_iter); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetGlobalsmoothType + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetGlobalsmoothType( HYPRE_Solver solver, HYPRE_Int iter_type ) +{ + return hypre_MGRSetGlobalsmoothType(solver, iter_type); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRGetNumIterations + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations ) +{ + return hypre_MGRGetNumIterations( solver, num_iterations ); +} +/*-------------------------------------------------------------------------- + * HYPRE_MGRGetFinalRelativeResidualNorm + *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_MGRGetFinalRelativeResidualNorm( HYPRE_Solver solver, HYPRE_Real *res_norm ) +{ + return hypre_MGRGetFinalRelativeResidualNorm(solver, res_norm); +} diff -Nru hypre-2.11.2/src/parcsr_ls/Makefile hypre-2.13.0/src/parcsr_ls/Makefile --- hypre-2.11.2/src/parcsr_ls/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ # $Revision$ #EHEADER********************************************************************** - include ../config/Makefile.config ## Euclid specific macros taken from distributed_ls/Euclid @@ -22,6 +21,8 @@ -I../distributed_ls/Euclid\ -I$(srcdir)\ -I$(srcdir)/..\ + -I$(srcdir)/../blas\ + -I$(srcdir)/../lapack\ -I$(srcdir)/../multivector\ -I$(srcdir)/../utilities\ -I$(srcdir)/../krylov\ @@ -72,6 +73,7 @@ HYPRE_parcsr_lgmres.c\ HYPRE_parcsr_hybrid.c\ HYPRE_parcsr_int.c\ + HYPRE_parcsr_mgr.c\ HYPRE_parcsr_ParaSails.c\ HYPRE_parcsr_pcg.c\ HYPRE_parcsr_pilut.c\ @@ -86,6 +88,7 @@ par_cg_relax_wt.c\ par_coarsen.c\ par_cgc_coarsen.c\ + par_cheby.c\ par_coarse_parms.c\ par_coordinates.c\ par_cr.c\ @@ -101,12 +104,16 @@ par_laplace_9pt.c\ par_laplace.c\ par_lr_interp.c\ + par_mgr.c\ + par_mgr_setup.c\ + par_mgr_solve.c\ par_nongalerkin.c\ par_nodal_systems.c\ par_rap.c\ par_rap_communication.c\ par_rotate_7pt.c\ par_vardifconv.c\ + par_vardifconv_rs.c\ par_relax.c\ par_relax_more.c\ par_relax_interface.c\ diff -Nru hypre-2.11.2/src/parcsr_ls/par_add_cycle.c hypre-2.13.0/src/parcsr_ls/par_add_cycle.c --- hypre-2.11.2/src/parcsr_ls/par_add_cycle.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_add_cycle.c 2017-10-20 17:42:22.000000000 +0000 @@ -38,6 +38,7 @@ hypre_ParCSRMatrix **P_array; hypre_ParCSRMatrix **R_array; hypre_ParCSRMatrix *Lambda; + hypre_ParCSRMatrix *Atilde; hypre_ParVector **F_array; hypre_ParVector **U_array; hypre_ParVector *Vtemp; @@ -46,11 +47,12 @@ HYPRE_Int **CF_marker_array; HYPRE_Int num_levels; - HYPRE_Int addlvl; + HYPRE_Int addlvl, add_end; HYPRE_Int additive; HYPRE_Int mult_additive; HYPRE_Int simple; - HYPRE_Int i, num_rows; + HYPRE_Int add_last_lvl; + HYPRE_Int i, j, num_rows; HYPRE_Int n_global; HYPRE_Int rlx_order; @@ -61,7 +63,9 @@ HYPRE_Int fine_grid; HYPRE_Int rlx_down; HYPRE_Int rlx_up; + HYPRE_Int rlx_coarse; HYPRE_Int *grid_relax_type; + HYPRE_Int *num_grid_sweeps; HYPRE_Real **l1_norms; HYPRE_Real alpha, beta; HYPRE_Real *u_data; @@ -92,8 +96,10 @@ additive = hypre_ParAMGDataAdditive(amg_data); mult_additive = hypre_ParAMGDataMultAdditive(amg_data); simple = hypre_ParAMGDataSimple(amg_data); + add_last_lvl = hypre_ParAMGDataAddLastLvl(amg_data); grid_relax_type = hypre_ParAMGDataGridRelaxType(amg_data); Lambda = hypre_ParAMGDataLambda(amg_data); + Atilde = hypre_ParAMGDataAtilde(amg_data); Xtilde = hypre_ParAMGDataXtilde(amg_data); Rtilde = hypre_ParAMGDataRtilde(amg_data); l1_norms = hypre_ParAMGDataL1Norms(amg_data); @@ -101,11 +107,14 @@ relax_weight = hypre_ParAMGDataRelaxWeight(amg_data); omega = hypre_ParAMGDataOmega(amg_data); rlx_order = hypre_ParAMGDataRelaxOrder(amg_data); + num_grid_sweeps = hypre_ParAMGDataNumGridSweeps(amg_data); /* Initialize */ addlvl = hypre_max(additive, mult_additive); addlvl = hypre_max(addlvl, simple); + if (add_last_lvl == -1 ) add_end = num_levels-1; + else add_end = add_last_lvl; Solve_err_flag = 0; /*--------------------------------------------------------------------- @@ -115,6 +124,7 @@ /* down cycle */ rlx_down = grid_relax_type[1]; rlx_up = grid_relax_type[2]; + rlx_coarse = grid_relax_type[3]; for (level = 0; level < num_levels-1; level++) { fine_grid = level; @@ -126,7 +136,7 @@ hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0); - if (level < addlvl) /* multiplicative version */ + if (level < addlvl || level > add_end) /* multiplicative version */ { /* smoothing step */ @@ -134,33 +144,42 @@ { HYPRE_Real *A_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A_array[fine_grid])); HYPRE_Int *A_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A_array[fine_grid])); - hypre_ParVectorCopy(F_array[fine_grid],Vtemp); num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid])); + for (j=0; j < num_grid_sweeps[1]; j++) + { + hypre_ParVectorCopy(F_array[fine_grid],Vtemp); #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE #endif - for (i = 0; i < num_rows; i++) + for (i = 0; i < num_rows; i++) u_data[i] = relax_weight[level]*v_data[i] / A_data[A_i[i]]; + } } else if (rlx_down != 18) { /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_down,0,*/ - hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid], - CF_marker_array[fine_grid], rlx_down,rlx_order,1, - relax_weight[fine_grid], omega[fine_grid], - l1_norms[level], U_array[fine_grid], Vtemp, Ztemp); - hypre_ParVectorCopy(F_array[fine_grid],Vtemp); + for (j=0; j < num_grid_sweeps[1]; j++) + { + hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid], + CF_marker_array[fine_grid], rlx_down,rlx_order,1, + relax_weight[fine_grid], omega[fine_grid], + l1_norms[level], U_array[fine_grid], Vtemp, Ztemp); + hypre_ParVectorCopy(F_array[fine_grid],Vtemp); + } } else { - hypre_ParVectorCopy(F_array[fine_grid],Vtemp); num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid])); + for (j=0; j < num_grid_sweeps[1]; j++) + { + hypre_ParVectorCopy(F_array[fine_grid],Vtemp); #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE #endif - for (i = 0; i < num_rows; i++) + for (i = 0; i < num_rows; i++) u_data[i] += v_data[i] / l1_norms_lvl[i]; + } } alpha = -1.0; @@ -188,7 +207,7 @@ } } - /* solve coarse grid */ + /* additive smoothing and solve coarse grid */ if (addlvl < num_levels) { if (simple > -1) @@ -203,16 +222,38 @@ x_global[i] += D_inv[i]*r_global[i]; } else - hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde); + { + if (num_grid_sweeps[1] > 1) + { + n_global = hypre_VectorSize(hypre_ParVectorLocalVector(Rtilde)); + hypre_ParVector *Tmptilde = hypre_CTAlloc(hypre_ParVector, 1); + hypre_Vector *Tmptilde_local = hypre_SeqVectorCreate(n_global); + hypre_SeqVectorInitialize(Tmptilde_local); + hypre_ParVectorLocalVector(Tmptilde) = Tmptilde_local; + hypre_ParVectorOwnsData(Tmptilde) = 1; + hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 0.0, Tmptilde); + hypre_ParVectorScale(2.0,Rtilde); + hypre_ParCSRMatrixMatvec(-1.0, Atilde, Tmptilde, 1.0, Rtilde); + hypre_ParVectorDestroy(Tmptilde); + } + hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde); + } if (addlvl == 0) hypre_ParVectorCopy(Xtilde, U_array[0]); } - else + if (add_end < num_levels -1) { fine_grid = num_levels -1; - hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid], + for (j=0; j < num_grid_sweeps[3]; j++) + if (rlx_coarse == 18) + hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid], 1, 1, l1_norms[fine_grid], 1.0, 1.0 ,0,0,0,0, U_array[fine_grid], Vtemp, Ztemp); + else + hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid], + NULL, rlx_coarse,0,0, + relax_weight[fine_grid], omega[fine_grid], + l1_norms[fine_grid], U_array[fine_grid], Vtemp, Ztemp); } /* up cycle */ @@ -221,7 +262,7 @@ fine_grid = level - 1; coarse_grid = level; - if (level <= addlvl) /* multiplicative version */ + if (level <= addlvl || level > add_end+1) /* multiplicative version */ { alpha = 1.0; beta = 1.0; @@ -230,7 +271,8 @@ beta, U_array[fine_grid]); if (rlx_up != 18) /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_up,0,*/ - hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid], + for (j=0; j < num_grid_sweeps[2]; j++) + hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid], CF_marker_array[fine_grid], rlx_up,rlx_order,2, relax_weight[fine_grid], omega[fine_grid], @@ -240,6 +282,7 @@ HYPRE_Int loc_relax_points[2]; loc_relax_points[0] = -1; loc_relax_points[1] = 1; + for (j=0; j < num_grid_sweeps[2]; j++) for (i=0; i < 2; i++) hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid],F_array[fine_grid], CF_marker_array[fine_grid], @@ -248,6 +291,7 @@ U_array[fine_grid], Vtemp); } else + for (j=0; j < num_grid_sweeps[2]; j++) hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid], 1, 1, l1_norms[fine_grid], 1.0, 1.0 ,0,0,0,0, @@ -281,6 +325,11 @@ hypre_ParCSRMatrix *Lambda; hypre_CSRMatrix *L_diag; hypre_CSRMatrix *L_offd; + hypre_ParCSRMatrix *Atilde; + hypre_CSRMatrix *Atilde_diag; + hypre_CSRMatrix *Atilde_offd; + HYPRE_Real *Atilde_diag_data; + HYPRE_Real *Atilde_offd_data; hypre_CSRMatrix *A_tmp_diag; hypre_CSRMatrix *A_tmp_offd; hypre_ParVector *Xtilde; @@ -305,6 +354,10 @@ HYPRE_Int *L_diag_j; HYPRE_Int *L_offd_i; HYPRE_Int *L_offd_j; + HYPRE_Int *Atilde_diag_i; + HYPRE_Int *Atilde_diag_j; + HYPRE_Int *Atilde_offd_i; + HYPRE_Int *Atilde_offd_j; HYPRE_Int *A_tmp_diag_i; HYPRE_Int *A_tmp_offd_i; HYPRE_Int *A_tmp_diag_j; @@ -340,10 +393,12 @@ HYPRE_Int num_cols_offd = 0; HYPRE_Int level, i, j, k; HYPRE_Int this_proc, cnt, cnt_diag, cnt_offd; + HYPRE_Int A_cnt_diag, A_cnt_offd; HYPRE_Int cnt_recv, cnt_send, cnt_row, row_start; HYPRE_Int start_diag, start_offd, indx, cnt_map; HYPRE_Int start, j_indx, index, cnt_level; HYPRE_Int max_sends, max_recvs; + HYPRE_Int ns; /* Local variables */ HYPRE_Int Solve_err_flag = 0; @@ -354,6 +409,7 @@ /*HYPRE_Real *relax_weight = NULL; HYPRE_Int relax_type; */ HYPRE_Int add_rlx; + HYPRE_Int add_last_lvl, add_end; HYPRE_Real add_rlx_wt; /* Acquire data and allocate storage */ @@ -363,19 +419,23 @@ U_array = hypre_ParAMGDataUArray(amg_data); additive = hypre_ParAMGDataAdditive(amg_data); mult_additive = hypre_ParAMGDataMultAdditive(amg_data); + add_last_lvl = hypre_ParAMGDataAddLastLvl(amg_data); num_levels = hypre_ParAMGDataNumLevels(amg_data); /*relax_weight = hypre_ParAMGDataRelaxWeight(amg_data); relax_type = hypre_ParAMGDataGridRelaxType(amg_data)[1];*/ comm = hypre_ParCSRMatrixComm(A_array[0]); add_rlx = hypre_ParAMGDataAddRelaxType(amg_data); add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data); + ns = hypre_ParAMGDataNumGridSweeps(amg_data)[1]; hypre_MPI_Comm_size(comm,&num_procs); l1_norms_ptr = hypre_ParAMGDataL1Norms(amg_data); addlvl = hypre_max(additive, mult_additive); - num_add_lvls = num_levels+1-addlvl; + if (add_last_lvl != -1) add_end = add_last_lvl+1; + else add_end = num_levels; + num_add_lvls = add_end+1-addlvl; level_start = hypre_CTAlloc(HYPRE_Int, num_add_lvls+1); send_data_L = 0; @@ -387,7 +447,7 @@ cnt = 1; max_sends = 0; max_recvs = 0; - for (i=addlvl; i < num_levels; i++) + for (i=addlvl; i < add_end; i++) { A_tmp = A_array[i]; A_tmp_diag = hypre_ParCSRMatrixDiag(A_tmp); @@ -426,7 +486,7 @@ { if (max_sends < num_procs && max_recvs < num_procs) { - for (i=addlvl; i < num_levels; i++) + for (i=addlvl; i < add_end; i++) { A_tmp = A_array[i]; comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp); @@ -482,7 +542,7 @@ L_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs_L+1); L_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends_L+1); - for (i=addlvl; i < num_levels; i++) + for (i=addlvl; i < add_end; i++) { A_tmp = A_array[i]; comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp); @@ -524,7 +584,7 @@ { num_recvs_L = 0; num_sends_L = 0; - for (i=addlvl; i < num_levels; i++) + for (i=addlvl; i < add_end; i++) { A_tmp = A_array[i]; comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp); @@ -591,6 +651,7 @@ L_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd); hypre_CSRMatrixInitialize(L_diag); hypre_CSRMatrixInitialize(L_offd); + if (num_nonzeros_diag) { L_diag_data = hypre_CSRMatrixData(L_diag); @@ -604,6 +665,26 @@ } L_offd_i = hypre_CSRMatrixI(L_offd); + if (ns > 1) + { + Atilde_diag = hypre_CSRMatrixCreate(num_rows_L, num_rows_L, num_nonzeros_diag); + Atilde_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd); + hypre_CSRMatrixInitialize(Atilde_diag); + hypre_CSRMatrixInitialize(Atilde_offd); + if (num_nonzeros_diag) + { + Atilde_diag_data = hypre_CSRMatrixData(Atilde_diag); + Atilde_diag_j = hypre_CSRMatrixJ(Atilde_diag); + } + Atilde_diag_i = hypre_CSRMatrixI(Atilde_diag); + if (num_nonzeros_offd) + { + Atilde_offd_data = hypre_CSRMatrixData(Atilde_offd); + Atilde_offd_j = hypre_CSRMatrixJ(Atilde_offd); + } + Atilde_offd_i = hypre_CSRMatrixI(Atilde_offd); + } + if (num_rows_L) D_data = hypre_CTAlloc(HYPRE_Real,num_rows_L); if (send_data_L) { @@ -639,7 +720,14 @@ cnt_row = 1; L_diag_i[0] = 0; L_offd_i[0] = 0; - for (level=addlvl; level < num_levels; level++) + if (ns > 1) + { + A_cnt_diag = 0; + A_cnt_offd = 0; + Atilde_diag_i[0] = 0; + Atilde_offd_i[0] = 0; + } + for (level=addlvl; level < add_end; level++) { row_start = level_start[cnt_level]; if (level != 0) @@ -716,11 +804,17 @@ #pragma omp for private(i) HYPRE_SMP_SCHEDULE #endif for (i=0; i < num_rows_tmp; i++) - { + { D_data[i] = add_rlx_wt/A_tmp_diag_data[A_tmp_diag_i[i]]; L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1]; L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1]; - } + } + if (ns > 1) + for (i=0; i < num_rows_tmp; i++) + { + Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1]; + Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1]; + } } else { @@ -734,6 +828,12 @@ L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1]; L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1]; } + if (ns > 1) + for (i=0; i < num_rows_tmp; i++) + { + Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1]; + Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1]; + } } if (num_procs > 1) @@ -754,6 +854,11 @@ for (i = 0; i < num_rows_tmp; i++) { j_indx = A_tmp_diag_i[i]; + if (ns > 1) + { + Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j_indx]; + Atilde_diag_j[A_cnt_diag++] = i+row_start; + } L_diag_data[cnt_diag] = (2.0 - A_tmp_diag_data[j_indx]*D_data[i])*D_data[i]; L_diag_j[cnt_diag++] = i+row_start; for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++) @@ -768,6 +873,21 @@ L_offd_data[cnt_offd] = (- A_tmp_offd_data[j]*D_data_offd[j_indx])*D_data[i]; L_offd_j[cnt_offd++] = remap[j_indx]; } + if (ns > 1) + { + for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++) + { + j_indx = A_tmp_diag_j[j]; + Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j]; + Atilde_diag_j[A_cnt_diag++] = j_indx+row_start; + } + for (j=A_tmp_offd_i[i]; j < A_tmp_offd_i[i+1]; j++) + { + j_indx = A_tmp_offd_j[j]; + Atilde_offd_data[A_cnt_offd] = A_tmp_offd_data[j]; + Atilde_offd_j[A_cnt_offd++] = remap[j_indx]; + } + } } cnt_row += num_rows_tmp; } @@ -809,6 +929,52 @@ hypre_ParCSRMatrixComm(Lambda) = comm; hypre_ParCSRMatrixOwnsData(Lambda) = 1; + if (ns > 1) + { + /*hypre_ParCSRCommPkg *A_comm_pkg = NULL; + HYPRE_Int *A_recv_ptr = NULL; + HYPRE_Int *A_send_ptr = NULL; + HYPRE_Int *A_recv_procs = NULL; + HYPRE_Int *A_send_procs = NULL; + HYPRE_Int *A_send_map_elmts = NULL; + + A_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg,1); + + A_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs+1); + A_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends+1); + A_recv_procs = hypre_CTAlloc(HYPRE_Int, num_recvs_L); + A_send_procs = hypre_CTAlloc(HYPRE_Int, num_sends_L); + A_send_map_elmts = hypre_CTAlloc(HYPRE_Int, L_send_ptr[num_sends_L]); + + for (i=0; i CF_marker_array) +#define hypre_ParAMGDataCPointMarkerArray(amg_data) ((amg_data)-> C_point_marker_array) #define hypre_ParAMGDataAArray(amg_data) ((amg_data)->A_array) #define hypre_ParAMGDataFArray(amg_data) ((amg_data)->F_array) #define hypre_ParAMGDataUArray(amg_data) ((amg_data)->U_array) @@ -332,9 +343,13 @@ #define hypre_ParAMGDataMaxEigEst(amg_data) ((amg_data)->max_eig_est) #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est) +#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est) +#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant) +#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale) #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order) #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction) - +#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds) +#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs) /* block */ #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array) @@ -408,11 +423,13 @@ #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive) #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive) #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple) +#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl) #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts) #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor) #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type) #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt) #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda) +#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde) #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde) #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde) #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv) @@ -425,6 +442,8 @@ #define hypre_ParAMGDataRAP2(amg_data) ((amg_data)->rap2) #define hypre_ParAMGDataKeepTranspose(amg_data) ((amg_data)->keepTranspose) + + #endif diff -Nru hypre-2.11.2/src/parcsr_ls/par_amg_setup.c hypre-2.13.0/src/parcsr_ls/par_amg_setup.c --- hypre-2.11.2/src/parcsr_ls/par_amg_setup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_amg_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -79,10 +79,11 @@ HYPRE_Int mult_additive = hypre_ParAMGDataMultAdditive(amg_data); HYPRE_Int additive = hypre_ParAMGDataAdditive(amg_data); HYPRE_Int simple = hypre_ParAMGDataSimple(amg_data); + HYPRE_Int add_last_lvl = hypre_ParAMGDataAddLastLvl(amg_data); HYPRE_Int add_P_max_elmts = hypre_ParAMGDataMultAddPMaxElmts(amg_data); HYPRE_Real add_trunc_factor = hypre_ParAMGDataMultAddTruncFactor(amg_data); HYPRE_Int add_rlx = hypre_ParAMGDataAddRelaxType(amg_data); - HYPRE_Real add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data); + HYPRE_Real add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data); hypre_ParCSRBlockMatrix **A_block_array, **P_block_array; @@ -99,8 +100,11 @@ hypre_ParCSRMatrix *AN = NULL; hypre_ParCSRMatrix *P1; hypre_ParCSRMatrix *P2; + hypre_ParCSRMatrix *Pnew = NULL; HYPRE_Real *SmoothVecs = NULL; HYPRE_Real **l1_norms = NULL; + HYPRE_Real **cheby_ds = NULL; + HYPRE_Real **cheby_coefs = NULL; HYPRE_Int old_num_levels, num_levels; HYPRE_Int level; @@ -183,7 +187,28 @@ HYPRE_Int rap2 = hypre_ParAMGDataRAP2(amg_data); HYPRE_Int keepTranspose = hypre_ParAMGDataKeepTranspose(amg_data); + HYPRE_Int **C_point_marker_array; + HYPRE_Int local_coarse_size; + HYPRE_Int num_C_point_coarse = hypre_ParAMGDataNumCPointKeep(amg_data); + HYPRE_Int *C_point_keep; + + HYPRE_Int *num_grid_sweeps = hypre_ParAMGDataNumGridSweeps(amg_data); + HYPRE_Int ns = num_grid_sweeps[1]; HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Int add_end; + +#ifdef HYPRE_USE_GPU + if (!hypre_ParCSRMatrixIsManaged(A)){ + hypre_fprintf(stderr,"ERROR:: INVALID A in hypre_BoomerAMGSetup::Address %p\n",A); + //exit(2); + } else if(!hypre_ParVectorIsManaged(f)){ + hypre_fprintf(stderr,"ERROR:: INVALID f in hypre_BoomerAMGSetup::Address %p\n",f); + //exit(2); + } else if (!hypre_ParVectorIsManaged(u)){ + hypre_fprintf(stderr,"ERROR:: INVALID u in hypre_BoomerAMGSetup::Address %p\n",u); + //exit(2); + } +#endif /*hypre_CSRMatrix *A_new;*/ @@ -196,6 +221,8 @@ hypre_CSRMatrixPrint(A_new, "Atestnew"); */ old_num_levels = hypre_ParAMGDataNumLevels(amg_data); max_levels = hypre_ParAMGDataMaxLevels(amg_data); + add_end = hypre_min(add_last_lvl, max_levels-1); + if (add_end == -1) add_end = max_levels-1; amg_logging = hypre_ParAMGDataLogging(amg_data); amg_print_level = hypre_ParAMGDataPrintLevel(amg_data); coarsen_type = hypre_ParAMGDataCoarsenType(amg_data); @@ -246,6 +273,8 @@ grid_relax_type[3] = hypre_ParAMGDataUserCoarseRelaxType(amg_data); + C_point_marker_array = hypre_ParAMGDataCPointKeepMarkerArray(amg_data); + HYPRE_ANNOTATION_BEGIN("BoomerAMG.setup"); /* change in definition of standard and multipass interpolation, by @@ -596,6 +625,7 @@ dof_func_array[0] = dof_func; hypre_ParAMGDataCFMarkerArray(amg_data) = CF_marker_array; + hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = C_point_marker_array; hypre_ParAMGDataDofFuncArray(amg_data) = dof_func_array; hypre_ParAMGDataAArray(amg_data) = A_array; hypre_ParAMGDataPArray(amg_data) = P_array; @@ -745,7 +775,6 @@ while (not_finished_coarsening) { - /* only do nodal coarsening on a fixed number of levels */ if (level >= nodal_levels) { @@ -1086,6 +1115,43 @@ AN = NULL; } } + + /**************************************************/ + /*********Set the fixed index to CF_marker*********/ + //num_C_point_coarse + + if (hypre_ParAMGDataCPointKeepLevel(amg_data) > 0) + { + if (block_mode) + { + printf("Keeping coarse nodes in block mode is not implemented\n"); + } + else if (level < hypre_ParAMGDataCPointKeepLevel(amg_data)) + { + C_point_keep = C_point_marker_array[level]; + if (level < hypre_ParAMGDataCPointKeepLevel(amg_data)-1) + C_point_marker_array[level+1] = hypre_CTAlloc(HYPRE_Int, num_C_point_coarse); + + for(j = 0;j < num_C_point_coarse;j++) + { + CF_marker[C_point_keep[j]] = 2; + } + + local_coarse_size = 0; + k = 0; + for (j = 0; j < local_num_vars; j ++) + { + if (CF_marker[j] == 1) local_coarse_size++; + if (CF_marker[j] == 2) { + if (level < hypre_ParAMGDataCPointKeepLevel(amg_data)-1) + C_point_marker_array[level+1][k++] = local_coarse_size; + local_coarse_size++; + CF_marker[j] = 1; + } + } + } + } + /*****xxxxxxxxxxxxx changes for min_coarse_size */ /* here we will determine the coarse grid size to be able to determine if it is not smaller than requested minimal size */ @@ -1144,7 +1210,8 @@ if (S) hypre_ParCSRMatrixDestroy(S); if (SN) hypre_ParCSRMatrixDestroy(SN); if (AN) hypre_ParCSRMatrixDestroy(AN); - hypre_TFree(CF_marker); + if (num_functions > 1) hypre_TFree(coarse_dof_func); + hypre_TFree(CF_marker); hypre_TFree(coarse_pnts_global); if (level > 0) { @@ -1389,7 +1456,6 @@ fflush(NULL); } - if (debug_flag==1) wall_time = time_getWallclockSeconds(); if (interp_type == 4) @@ -1895,11 +1961,9 @@ if (!block_mode) { - if (mult_addlvl > -1 && level >= mult_addlvl) + if (mult_addlvl > -1 && level >= mult_addlvl && level <= add_end) { HYPRE_Real *d_diag; - hypre_ParCSRMatrix *Q = NULL; - Q = hypre_ParMatmul(A_array[level],P); if (add_rlx == 0) { hypre_CSRMatrix *lvl_Adiag = hypre_ParCSRMatrixDiag(A_array[level]); @@ -1919,52 +1983,110 @@ else hypre_ParCSRComputeL1NormsThreads(A_array[level], 1, num_threads, NULL, &d_diag); } - hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]); - A_H = hypre_ParTMatmul(P,Q); - hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H); - hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1; - hypre_ParCSRMatrixOwnsColStarts(A_H) = 0; - hypre_ParCSRMatrixOwnsColStarts(P) = 0; - if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); - /*hypre_ParCSRMatrixDestroy(P); */ - hypre_TFree(d_diag); - /*hypre_BoomerAMGBuildCoarseOperator(P, A_array[level] , P, &A_H); - hypre_ParCSRMatrix *C = NULL; - HYPRE_Int *num_grid_sweeps - = hypre_ParAMGDataNumGridSweeps(amg_data); - if (grid_relax_type[1] == 18) - C = hypre_CreateC(A_array[level], 0.0); - else - C = hypre_CreateC(A_array[level], relax_weight[level]); - if (num_grid_sweeps[1] > 1) + if (ns == 1) { - hypre_ParCSRMatrix *Pnew = NULL; - Pnew = hypre_ParMatmul(C,P); - P_array[level] = hypre_ParMatmul(C,Pnew); - hypre_ParCSRMatrixDestroy(Pnew); + hypre_ParCSRMatrix *Q = NULL; + Q = hypre_ParMatmul(A_array[level],P); + hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]); + A_H = hypre_ParTMatmul(P,Q); + hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H); + hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1; + hypre_ParCSRMatrixOwnsColStarts(A_H) = 0; + hypre_ParCSRMatrixOwnsColStarts(P) = 0; + if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); + /*hypre_ParCSRMatrixDestroy(P); */ + hypre_TFree(d_diag); + /* Set NonGalerkin drop tol on each level */ + if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level]; + if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level]; + if (nongalerk_tol_l > 0.0) + { + /* Build Non-Galerkin Coarse Grid */ + hypre_ParCSRMatrix *Q = NULL; + hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q, + 0.333*strong_threshold, max_row_sum, num_functions, + dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], + /* nongalerk_tol, sym_collapse, lump_percent, beta );*/ + nongalerk_tol_l, 1, 0.5, 1.0 ); + + hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H); + if (!hypre_ParCSRMatrixCommPkg(A_H)) + hypre_MatvecCommPkgCreate(A_H); + } + hypre_ParCSRMatrixDestroy(Q); + } - else - P_array[level] = hypre_ParMatmul(C,P); - hypre_ParCSRMatrixDestroy(C); */ - - /* Set NonGalerkin drop tol on each level */ - if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level]; - if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level]; - if (nongalerk_tol_l > 0.0) + else { - /* Build Non-Galerkin Coarse Grid */ - hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q, + HYPRE_Int ns_tmp = ns; + hypre_ParCSRMatrix *C = NULL; + hypre_ParCSRMatrix *Ptmp = NULL; + /* Set NonGalerkin drop tol on each level */ + if (level < nongalerk_num_tol) + nongalerk_tol_l = nongalerk_tol[level]; + if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level]; + + if (nongalerk_tol_l > 0.0) + { + /* Construct AP, and then RAP */ + hypre_ParCSRMatrix *Q = NULL; + Q = hypre_ParMatmul(A_array[level],P_array[level]); + A_H = hypre_ParTMatmul(P_array[level],Q); + hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H); + hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1; + hypre_ParCSRMatrixOwnsColStarts(A_H) = 0; + hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0; + if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); + + /* Build Non-Galerkin Coarse Grid */ + hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q, 0.333*strong_threshold, max_row_sum, num_functions, dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], /* nongalerk_tol, sym_collapse, lump_percent, beta );*/ nongalerk_tol_l, 1, 0.5, 1.0 ); - hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H); - if (!hypre_ParCSRMatrixCommPkg(A_H)) - hypre_MatvecCommPkgCreate(A_H); - + if (!hypre_ParCSRMatrixCommPkg(A_H)) + hypre_MatvecCommPkgCreate(A_H); + + /* Delete AP */ + hypre_ParCSRMatrixDestroy(Q); + } + else if (rap2) + { + /* Use two matrix products to generate A_H */ + hypre_ParCSRMatrix *Q = NULL; + Q = hypre_ParMatmul(A_array[level],P_array[level]); + A_H = hypre_ParTMatmul(P_array[level],Q); + hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1; + hypre_ParCSRMatrixOwnsColStarts(A_H) = 0; + hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0; + if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); + /* Delete AP */ + hypre_ParCSRMatrixDestroy(Q); + } + else + hypre_BoomerAMGBuildCoarseOperatorKT(P, A_array[level] , P, + keepTranspose, &A_H); + + if (add_rlx == 18) + C = hypre_CreateC(A_array[level], 0.0); + else + C = hypre_CreateC(A_array[level], add_rlx_wt); + Ptmp = P; + while (ns_tmp > 0) + { + Pnew = Ptmp; + Ptmp = NULL; + Ptmp = hypre_ParMatmul(C,Pnew); + if (ns_tmp < ns) + hypre_ParCSRMatrixDestroy(Pnew); + ns_tmp--; + } + Pnew = Ptmp; + P_array[level] = Pnew; + hypre_ParCSRMatrixDestroy(C); } - hypre_ParCSRMatrixDestroy(Q); + if (add_P_max_elmts || add_trunc_factor) @@ -2012,7 +2134,7 @@ A_block_array[level+1] = A_H_block; } - else if (mult_addlvl == -1 || level < mult_addlvl) + else if (mult_addlvl == -1 || level < mult_addlvl || level > add_end) { /* Set NonGalerkin drop tol on each level */ if (level < nongalerk_num_tol) @@ -2062,6 +2184,12 @@ /* Compute standard Galerkin coarse-grid product */ hypre_BoomerAMGBuildCoarseOperatorKT(P_array[level], A_array[level] , P_array[level], keepTranspose, &A_H); + if (Pnew && ns==1) + { + hypre_ParCSRMatrixDestroy(P); + P_array[level] = Pnew; + } + } } @@ -2103,6 +2231,7 @@ if ( (seq_threshold >= coarse_threshold) && (coarse_size > coarse_threshold) && (level != max_levels-1)) { hypre_seqAMGSetup( amg_data, level, coarse_threshold); + } else if (grid_relax_type[3] == 9 || grid_relax_type[3] == 99) /*use of Gaussian elimination on coarsest level */ { @@ -2153,7 +2282,7 @@ hypre_ParVectorSetPartitioningOwner(U_array[level],0); } } - + /*----------------------------------------------------------------------- * enter all the stuff created, A[level], P[level], CF_marker[level], * for levels 1 through coarsest, into amg_data data structure @@ -2170,6 +2299,7 @@ *-----------------------------------------------------------------------*/ if (addlvl > -1 || + grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || grid_relax_type[3] == 7 || grid_relax_type[1] == 8 || grid_relax_type[2] == 8 || grid_relax_type[3] == 8 || grid_relax_type[1] == 13 || grid_relax_type[2] == 13 || grid_relax_type[3] == 13 || grid_relax_type[1] == 14 || grid_relax_type[2] == 14 || grid_relax_type[3] == 14 || @@ -2185,6 +2315,10 @@ min_eig_est = hypre_CTAlloc(HYPRE_Real, num_levels); hypre_ParAMGDataMaxEigEst(amg_data) = max_eig_est; hypre_ParAMGDataMinEigEst(amg_data) = min_eig_est; + cheby_ds = hypre_CTAlloc(HYPRE_Real *, num_levels); + cheby_coefs = hypre_CTAlloc(HYPRE_Real *, num_levels); + hypre_ParAMGDataChebyDS(amg_data) = cheby_ds; + hypre_ParAMGDataChebyCoefs(amg_data) = cheby_coefs; } if (grid_relax_type[0] == 15 ||grid_relax_type[1] == 15 || grid_relax_type[2] == 15 || grid_relax_type[3] == 15) /* CG */ @@ -2253,7 +2387,7 @@ } } } - for (j = addlvl; j < num_levels; j++) + for (j = addlvl; j < hypre_min(add_end+1, num_levels) ; j++) { if (add_rlx == 18 ) { @@ -2263,15 +2397,92 @@ hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]); } } + for (j = add_end+1; j < num_levels; j++) + { + if (num_threads == 1) + { + if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || + grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 || + grid_relax_type[2] == 14)) + { + if (relax_order) + hypre_ParCSRComputeL1Norms(A_array[j], 4, CF_marker_array[j], &l1_norms[j]); + else + hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]); + } + else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) + && j == num_levels-1) + { + hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]); + } + if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18) && j < num_levels-1) + { + if (relax_order) + hypre_ParCSRComputeL1Norms(A_array[j], 1, CF_marker_array[j], &l1_norms[j]); + else + hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]); + } + else if (grid_relax_type[3] == 18 && j == num_levels-1) + { + hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]); + } + } + else + { + if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || + grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 || + grid_relax_type[2] == 14)) + { + if (relax_order) + hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, CF_marker_array[j] , &l1_norms[j]); + else + hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]); + } + else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) + && j == num_levels-1) + { + hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]); + } + if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18) && j < num_levels-1) + { + if (relax_order) + hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, CF_marker_array[j], &l1_norms[j]); + else + hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]); + } + else if (grid_relax_type[3] == 18 && j == num_levels-1) + { + hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]); + } + } + } for (j = 0; j < num_levels; j++) { - if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1))) + if (grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || (grid_relax_type[3] == 7 && j== (num_levels-1))) { - HYPRE_Int scale = 1; - HYPRE_Real temp_d, temp_d2; - hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, 10, &temp_d, &temp_d2); - max_eig_est[j] = temp_d; - min_eig_est[j] = temp_d2; + hypre_ParCSRComputeL1Norms(A_array[j], 5, NULL, &l1_norms[j]); + } + else if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1))) + { + HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data);; + HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data); + HYPRE_Real max_eig, min_eig = 0; + HYPRE_Real *coefs = NULL; + HYPRE_Real *ds = NULL; + HYPRE_Int cheby_order = hypre_ParAMGDataChebyOrder(amg_data); + HYPRE_Int cheby_eig_est = hypre_ParAMGDataChebyEigEst(amg_data); + HYPRE_Real cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data); + if (cheby_eig_est) + hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, cheby_eig_est, + &max_eig, &min_eig); + else + hypre_ParCSRMaxEigEstimate(A_array[j], scale, &max_eig); + max_eig_est[j] = max_eig; + min_eig_est[j] = min_eig; + hypre_ParCSRRelax_Cheby_Setup(A_array[j],max_eig, min_eig, + cheby_fraction, cheby_order, scale, variant, &coefs, &ds); + cheby_coefs[j] = coefs; + cheby_ds[j] = ds; } else if (grid_relax_type[1] == 15 || (grid_relax_type[3] == 15 && j == (num_levels-1)) ) { diff -Nru hypre-2.11.2/src/parcsr_ls/par_cheby.c hypre-2.13.0/src/parcsr_ls/par_cheby.c --- hypre-2.11.2/src/parcsr_ls/par_cheby.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_cheby.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,338 @@ +/****************************************************************************** + * + * Chebyshev setup and solve + * + *****************************************************************************/ + +#include "_hypre_parcsr_ls.h" +#include "_hypre_parcsr_mv.h" +#include "float.h" + + +/****************************************************************************** + +Chebyshev relaxation + + +Can specify order 1-4 (this is the order of the resid polynomial)- here we +explicitly code the coefficients (instead of +iteratively determining) + + +variant 0: standard chebyshev +this is rlx 11 if scale = 0, and 16 if scale == 1 + +variant 1: modified cheby: T(t)* f(t) where f(t) = (1-b/t) +this is rlx 15 if scale = 0, and 17 if scale == 1 + +ratio indicates the percentage of the whole spectrum to use (so .5 +means half, and .1 means 10percent) + + +*******************************************************************************/ + +HYPRE_Int hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax with */ + HYPRE_Real max_eig, + HYPRE_Real min_eig, + HYPRE_Real fraction, + HYPRE_Int order, /* polynomial order */ + HYPRE_Int scale, /* scale by diagonal?*/ + HYPRE_Int variant, + HYPRE_Real **coefs_ptr, + HYPRE_Real **ds_ptr) /* initial/updated approximation */ +{ + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + + HYPRE_Real theta, delta; + + HYPRE_Real den; + HYPRE_Real upper_bound, lower_bound; + + HYPRE_Int j; + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag); + + HYPRE_Real *coefs = NULL; + + HYPRE_Int cheby_order; + + HYPRE_Real *ds_data = NULL; + HYPRE_Real diag; + + /* u = u + p(A)r */ + + if (order > 4) + order = 4; + if (order < 1) + order = 1; + + coefs = hypre_CTAlloc(HYPRE_Real, order+1); + /* we are using the order of p(A) */ + cheby_order = order -1; + + /* make sure we are large enough - Adams et al. 2003 */ + upper_bound = max_eig * 1.1; + /* lower_bound = max_eig/fraction; */ + lower_bound = (upper_bound - min_eig)* fraction + min_eig; + + + /* theta and delta */ + theta = (upper_bound + lower_bound)/2; + delta = (upper_bound - lower_bound)/2; + + if (variant == 1 ) + { + switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0 - so order is + one less that resid poly: r(t) = 1 - t*s(t) */ + { + case 0: + coefs[0] = 1.0/theta; + + break; + + case 1: /* (del - t + 2*th)/(th^2 + del*th) */ + den = (theta*theta + delta*theta); + + coefs[0] = (delta + 2*theta)/den; + coefs[1] = -1.0/den; + + break; + + case 2: /* (4*del*th - del^2 - t*(2*del + 6*th) + 2*t^2 + 6*th^2)/(2*del*th^2 - del^2*th - del^3 + 2*th^3)*/ + den = 2*delta*theta*theta - delta*delta*theta - pow(delta,3) + 2*pow(theta,3); + + coefs[0] = (4*delta*theta - pow(delta,2) + 6*pow(theta,2))/den; + coefs[1] = -(2*delta + 6*theta)/den; + coefs[2] = 2/den; + + break; + + case 3: /* -(6*del^2*th - 12*del*th^2 - t^2*(4*del + 16*th) + t*(12*del*th - 3*del^2 + 24*th^2) + 3*del^3 + 4*t^3 - 16*th^3)/(4*del*th^3 - 3*del^2*th^2 - 3*del^3*th + 4*th^4)*/ + den = - (4*delta*pow(theta,3) - 3*pow(delta,2)*pow(theta,2) - 3*pow(delta,3)*theta + 4*pow(theta,4) ); + + coefs[0] = (6*pow(delta,2)*theta - 12*delta*pow(theta,2) + 3*pow(delta,3) - 16*pow(theta,3) )/den; + coefs[1] = (12*delta*theta - 3*pow(delta,2) + 24*pow(theta,2))/den; + coefs[2] = -( 4*delta + 16*theta)/den; + coefs[3] = 4/den; + + break; + } + } + + else /* standard chebyshev */ + { + + switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0 - so order is + one less thatn resid poly: r(t) = 1 - t*s(t) */ + { + case 0: + coefs[0] = 1.0/theta; + break; + + case 1: /* ( 2*t - 4*th)/(del^2 - 2*th^2) */ + den = delta*delta - 2*theta*theta; + + coefs[0] = -4*theta/den; + coefs[1] = 2/den; + + break; + + case 2: /* (3*del^2 - 4*t^2 + 12*t*th - 12*th^2)/(3*del^2*th - 4*th^3)*/ + den = 3*(delta*delta)*theta - 4*(theta*theta*theta); + + coefs[0] = (3*delta*delta - 12 *theta*theta)/den; + coefs[1] = 12*theta/den; + coefs[2] = -4/den; + + break; + + case 3: /*(t*(8*del^2 - 48*th^2) - 16*del^2*th + 32*t^2*th - 8*t^3 + 32*th^3)/(del^4 - 8*del^2*th^2 + 8*th^4)*/ + den = pow(delta,4) - 8*delta*delta*theta*theta + 8*pow(theta,4); + + coefs[0] = (32*pow(theta,3)- 16*delta*delta*theta)/den; + coefs[1] = (8*delta*delta - 48*theta*theta)/den; + coefs[2] = 32*theta/den; + coefs[3] = -8/den; + + break; + } + } + *coefs_ptr = coefs; + + if (scale) + { + /*grab 1/sqrt(diagonal) */ + ds_data = hypre_CTAlloc(HYPRE_Real, num_rows); + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j,diag) HYPRE_SMP_SCHEDULE +#endif + for (j = 0; j < num_rows; j++) + { + diag = A_diag_data[A_diag_i[j]]; + ds_data[j] = 1/sqrt(diag); + } + + }/* end of scaling code */ + *ds_ptr = ds_data; + + return hypre_error_flag; +} + +HYPRE_Int hypre_ParCSRRelax_Cheby_Solve(hypre_ParCSRMatrix *A, /* matrix to relax with */ + hypre_ParVector *f, /* right-hand side */ + HYPRE_Real *ds_data, + HYPRE_Real *coefs, + HYPRE_Int order, /* polynomial order */ + HYPRE_Int scale, /* scale by diagonal?*/ + HYPRE_Int variant, + hypre_ParVector *u, /* initial/updated approximation */ + hypre_ParVector *v /* temporary vector */, + hypre_ParVector *r /*another temp vector */ ) +{ + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *u_data = hypre_VectorData(hypre_ParVectorLocalVector(u)); + HYPRE_Real *f_data = hypre_VectorData(hypre_ParVectorLocalVector(f)); + HYPRE_Real *v_data = hypre_VectorData(hypre_ParVectorLocalVector(v)); + + HYPRE_Real *r_data = hypre_VectorData(hypre_ParVectorLocalVector(r)); + + HYPRE_Int i, j; + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag); + + HYPRE_Real mult; + HYPRE_Real *orig_u; + + HYPRE_Int cheby_order; + + HYPRE_Real *tmp_data; + + hypre_ParVector *tmp_vec; + + /* u = u + p(A)r */ + + if (order > 4) + order = 4; + if (order < 1) + order = 1; + + /* we are using the order of p(A) */ + cheby_order = order -1; + + orig_u = hypre_CTAlloc(HYPRE_Real, num_rows); + + if (!scale) + { + /* get residual: r = f - A*u */ + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); + + for ( i = 0; i < num_rows; i++ ) + { + orig_u[i] = u_data[i]; + u_data[i] = r_data[i] * coefs[cheby_order]; + } + for (i = cheby_order - 1; i >= 0; i-- ) + { + hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v); + mult = coefs[i]; +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = mult * r_data[j] + v_data[j]; + } + } + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for ( i = 0; i < num_rows; i++ ) + { + u_data[i] = orig_u[i] + u_data[i]; + } + } + else /* scaling! */ + { + + /*grab 1/sqrt(diagonal) */ + + tmp_vec = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A), + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixRowStarts(A)); + hypre_ParVectorInitialize(tmp_vec); + hypre_ParVectorSetPartitioningOwner(tmp_vec,0); + tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); + + /* get ds_data and get scaled residual: r = D^(-1/2)f - + * D^(-1/2)A*u */ + + hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec); +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]); + } + + /* save original u, then start + the iteration by multiplying r by the cheby coef.*/ + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + orig_u[j] = u_data[j]; /* orig, unscaled u */ + + u_data[j] = r_data[j] * coefs[cheby_order]; + } + + /* now do the other coefficients */ + for (i = cheby_order - 1; i >= 0; i-- ) + { + /* v = D^(-1/2)AD^(-1/2)u */ +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + tmp_data[j] = ds_data[j] * u_data[j]; + } + hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v); + + /* u_new = coef*r + v*/ + mult = coefs[i]; + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = mult * r_data[j] + ds_data[j]*v_data[j]; + } + + } /* end of cheby_order loop */ + + /* now we have to scale u_data before adding it to u_orig*/ + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = orig_u[j] + ds_data[j]*u_data[j]; + } + + hypre_ParVectorDestroy(tmp_vec); + + }/* end of scaling code */ + + hypre_TFree(orig_u); + + return hypre_error_flag; +} + diff -Nru hypre-2.11.2/src/parcsr_ls/par_coarsen.c hypre-2.13.0/src/parcsr_ls/par_coarsen.c --- hypre-2.11.2/src/parcsr_ls/par_coarsen.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_coarsen.c 2017-10-20 17:42:22.000000000 +0000 @@ -1118,7 +1118,8 @@ } else { - if (measure < 0) hypre_printf("negative measure!\n"); + if (measure < 0) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"negative measure!\n"); + /*if (measure < 0) hypre_printf("negative measure!\n");*/ CF_marker[j] = f_pnt; for (k = S_i[j]; k < S_i[j+1]; k++) { diff -Nru hypre-2.11.2/src/parcsr_ls/par_cr.c hypre-2.13.0/src/parcsr_ls/par_cr.c --- hypre-2.11.2/src/parcsr_ls/par_cr.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_cr.c 2017-10-20 17:42:22.000000000 +0000 @@ -90,7 +90,7 @@ hypre_fprintf(stdout,"-----------------------\n"); for (i = 0; i < n; i++) - e1[i] = 1.0e0+.1*rand(); + e1[i] = 1.0e0+.1*hypre_RandI(); /* stages */ while(1){ @@ -133,8 +133,8 @@ if (cf[i] == cpt) nc+=1.0e0; else if (cf[i] == fpt){ - e0[i] = 1.0e0+.1*rand(); - e1[i] = 1.0e0+.1*rand(); + e0[i] = 1.0e0+.1*hypre_RandI(); + e1[i] = 1.0e0+.1*hypre_RandI(); } } nstages += 1; @@ -2735,7 +2735,7 @@ for (i = 0; i < num_variables; i++) e1[i] = 1.0e0; - /*e1[i] = 1.0e0+.1*rand();*/ + /*e1[i] = 1.0e0+.1*hypre_RandI();*/ /* stages */ while(1) @@ -2852,7 +2852,7 @@ if (CF_marker[i] == fpt) { e1[i] = 1.0e0; - /*e1[i] = 1.0e0+.1*rand();*/ + /*e1[i] = 1.0e0+.1*hypre_RandI();*/ e0[i] = e1[i]; } } @@ -3011,8 +3011,8 @@ num_coarse++; else if (CF_marker[i] == fpt) { - e0[i] = 1.0e0+.1*rand(); - e1[i] = 1.0e0+.1*rand(); + e0[i] = 1.0e0+.1*hypre_RandI(); + e1[i] = 1.0e0+.1*hypre_RandI(); } } } @@ -3035,8 +3035,8 @@ for (j=0; j < num_functions; j++) { /*CF_marker[jj] = CFN_marker[i]; - e0[jj] = 1.0e0+.1*rand(); - e1[jj++] = 1.0e0+.1*rand();*/ + e0[jj] = 1.0e0+.1*hypre_RandI(); + e1[jj++] = 1.0e0+.1*hypre_RandI();*/ e0[jj] = 1.0e0; e1[jj++] = 1.0e0; } diff -Nru hypre-2.11.2/src/parcsr_ls/par_cycle.c hypre-2.13.0/src/parcsr_ls/par_cycle.c --- hypre-2.11.2/src/parcsr_ls/par_cycle.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_cycle.c 2017-10-20 17:42:22.000000000 +0000 @@ -76,10 +76,7 @@ HYPRE_Int block_mode; - HYPRE_Real *max_eig_est; - HYPRE_Real *min_eig_est; HYPRE_Int cheby_order; - HYPRE_Real cheby_fraction; /* Local variables */ HYPRE_Int *lev_counter; @@ -111,6 +108,8 @@ HYPRE_Real alpha; HYPRE_Real **l1_norms = NULL; HYPRE_Real *l1_norms_level; + HYPRE_Real **ds = hypre_ParAMGDataChebyDS(amg_data); + HYPRE_Real **coefs = hypre_ParAMGDataChebyCoefs(amg_data); HYPRE_Int seq_cg = 0; @@ -158,10 +157,10 @@ l1_norms = hypre_ParAMGDataL1Norms(amg_data); /* smooth_option = hypre_ParAMGDataSmoothOption(amg_data); */ - max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data); + /*max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data); min_eig_est = hypre_ParAMGDataMinEigEst(amg_data); + cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);*/ cheby_order = hypre_ParAMGDataChebyOrder(amg_data); - cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data); cycle_op_count = hypre_ParAMGDataCycleOpCount(amg_data); @@ -434,6 +433,18 @@ } else /* not CF - so use through AMS */ { +#ifdef HYPRE_USE_GPU + hypre_ParCSRRelax(A_array[level], + Aux_F, + 1, + 1, + l1_norms_level, + relax_weight[level], + omega[level],0,0,0,0, + Aux_U, + Vtemp, + Ztemp); +#else if (num_threads == 1) hypre_ParCSRRelax(A_array[level], Aux_F, @@ -457,6 +468,7 @@ Aux_U, Vtemp, Ztemp); +#endif } } else if (relax_type == 15) @@ -470,13 +482,11 @@ } else if (relax_type == 16) { /* scaled Chebyshev */ - HYPRE_Int scale = 1; - HYPRE_Int variant = 0; - hypre_ParCSRRelax_Cheby(A_array[level], - Aux_F, - max_eig_est[level], - min_eig_est[level], - cheby_fraction, cheby_order, scale, + HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data); + HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data); + hypre_ParCSRRelax_Cheby_Solve(A_array[level], Aux_F, + ds[level], coefs[level], + cheby_order, scale, variant, Aux_U, Vtemp, Ztemp ); } else if (relax_type ==17) diff -Nru hypre-2.11.2/src/parcsr_ls/par_gsmg.c hypre-2.13.0/src/parcsr_ls/par_gsmg.c --- hypre-2.11.2/src/parcsr_ls/par_gsmg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_gsmg.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,10 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - - - /****************************************************************************** * * Geometrically smooth interpolation multigrid @@ -27,20 +23,7 @@ #include "_hypre_parcsr_ls.h" #include "par_amg.h" -#ifdef HYPRE_USING_ESSL -#include -#else -#include "fortran.h" -#ifdef __cplusplus -extern "C" { -#endif -HYPRE_Int hypre_F90_NAME_LAPACK(dgels, DGELS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, - HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -#ifdef __cplusplus -} -#endif - -#endif +#include "_hypre_lapack.h" #ifndef ABS #define ABS(x) ((x)>0 ? (x) : -(x)) @@ -563,7 +546,7 @@ for (sample=0; sample + +/* Create */ +void * +hypre_MGRCreate() +{ + hypre_ParMGRData *mgr_data; + + mgr_data = hypre_CTAlloc(hypre_ParMGRData, 1); + + /* block data */ + (mgr_data -> block_size) = 1; + (mgr_data -> num_coarse_indexes) = 1; + (mgr_data -> block_num_coarse_indexes) = NULL; + (mgr_data -> block_cf_marker) = NULL; + + /* general data */ + (mgr_data -> max_num_coarse_levels) = 10; + (mgr_data -> A_array) = NULL; + (mgr_data -> P_array) = NULL; + (mgr_data -> RT_array) = NULL; + (mgr_data -> RAP) = NULL; + (mgr_data -> CF_marker_array) = NULL; + (mgr_data -> coarse_indices_lvls) = NULL; + + (mgr_data -> F_array) = NULL; + (mgr_data -> U_array) = NULL; + (mgr_data -> residual) = NULL; + (mgr_data -> rel_res_norms) = NULL; + (mgr_data -> Vtemp) = NULL; + (mgr_data -> Ztemp) = NULL; + (mgr_data -> Utemp) = NULL; + (mgr_data -> Ftemp) = NULL; + + (mgr_data -> num_iterations) = 0; + (mgr_data -> num_interp_sweeps) = 1; + (mgr_data -> trunc_factor) = 0.0; + (mgr_data -> max_row_sum) = 0.9; + (mgr_data -> strong_threshold) = 0.25; + (mgr_data -> S_commpkg_switch) = 1.0; + (mgr_data -> P_max_elmts) = 0; + + (mgr_data -> coarse_grid_solver) = NULL; + (mgr_data -> coarse_grid_solver_setup) = NULL; + (mgr_data -> coarse_grid_solver_solve) = NULL; + + (mgr_data -> global_smoother) = NULL; + + (mgr_data -> use_default_cgrid_solver) = 1; + (mgr_data -> omega) = 1.; + (mgr_data -> max_iter) = 20; + (mgr_data -> tol) = 1.0e-7; + (mgr_data -> relax_type) = 0; + (mgr_data -> relax_order) = 1; + (mgr_data -> interp_type) = 2; + (mgr_data -> restrict_type) = 0; + (mgr_data -> num_relax_sweeps) = 1; + (mgr_data -> relax_weight) = 1.0; + + (mgr_data -> logging) = 0; + (mgr_data -> print_level) = 0; + + (mgr_data -> l1_norms) = NULL; + + (mgr_data -> reserved_coarse_size) = 0; + (mgr_data -> reserved_coarse_indexes) = NULL; + (mgr_data -> reserved_Cpoint_local_indexes) = NULL; + + (mgr_data -> diaginv) = NULL; + (mgr_data -> global_smooth_iters) = 1; + (mgr_data -> global_smooth_type) = 0; + + (mgr_data -> set_non_Cpoints_to_F) = 0; + + (mgr_data -> Frelax_method) = 0; + (mgr_data -> FrelaxVcycleData) = NULL; + (mgr_data -> max_local_lvls) = 10; + + return (void *) mgr_data; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ +/* Destroy */ +HYPRE_Int +hypre_MGRDestroy( void *data ) +{ + hypre_ParMGRData * mgr_data = (hypre_ParMGRData*) data; + + HYPRE_Int i; + HYPRE_Int num_coarse_levels = (mgr_data -> num_coarse_levels); + + /* block info data */ + if ((mgr_data -> block_cf_marker)) + { + for (i=0; i < (mgr_data -> max_num_coarse_levels); i++) + { + if ((mgr_data -> block_cf_marker)[i]) + { + hypre_TFree((mgr_data -> block_cf_marker)[i]); + } + } + hypre_TFree((mgr_data -> block_cf_marker)); + (mgr_data -> block_cf_marker) = NULL; + } + + if(mgr_data -> block_num_coarse_indexes) + { + hypre_TFree(mgr_data -> block_num_coarse_indexes); + (mgr_data -> block_num_coarse_indexes) = NULL; + } + + /* final residual vector */ + if((mgr_data -> residual)) + { + hypre_ParVectorDestroy( (mgr_data -> residual) ); + (mgr_data -> residual) = NULL; + } + if((mgr_data -> rel_res_norms)) + { + hypre_TFree( (mgr_data -> rel_res_norms) ); + (mgr_data -> rel_res_norms) = NULL; + } + /* temp vectors for solve phase */ + if((mgr_data -> Vtemp)) + { + hypre_ParVectorDestroy( (mgr_data -> Vtemp) ); + (mgr_data -> Vtemp) = NULL; + } + if((mgr_data -> Ztemp)) + { + hypre_ParVectorDestroy( (mgr_data -> Ztemp) ); + (mgr_data -> Ztemp) = NULL; + } + if((mgr_data -> Utemp)) + { + hypre_ParVectorDestroy( (mgr_data -> Utemp) ); + (mgr_data -> Utemp) = NULL; + } + if((mgr_data -> Ftemp)) + { + hypre_ParVectorDestroy( (mgr_data -> Ftemp) ); + (mgr_data -> Ftemp) = NULL; + } + /* coarse grid solver */ + if((mgr_data -> use_default_cgrid_solver)) + { + if((mgr_data -> coarse_grid_solver)) + hypre_BoomerAMGDestroy( (mgr_data -> coarse_grid_solver) ); + (mgr_data -> coarse_grid_solver) = NULL; + } + /* l1_norms */ + if ((mgr_data -> l1_norms)) + { + for (i=0; i < (num_coarse_levels); i++) + if ((mgr_data -> l1_norms)[i]) + hypre_TFree((mgr_data -> l1_norms)[i]); + hypre_TFree((mgr_data -> l1_norms)); + } + + /* coarse_indices_lvls */ + if ((mgr_data -> coarse_indices_lvls)) + { + for (i=0; i < (num_coarse_levels); i++) + if ((mgr_data -> coarse_indices_lvls)[i]) + hypre_TFree((mgr_data -> coarse_indices_lvls)[i]); + hypre_TFree((mgr_data -> coarse_indices_lvls)); + } + + /* linear system and cf marker array */ + if(mgr_data -> A_array || mgr_data -> P_array || mgr_data -> RT_array || mgr_data -> CF_marker_array) + { + for (i=1; i < num_coarse_levels+1; i++) { + hypre_ParVectorDestroy((mgr_data -> F_array)[i]); + hypre_ParVectorDestroy((mgr_data -> U_array)[i]); + + if ((mgr_data -> P_array)[i-1]) + hypre_ParCSRMatrixDestroy((mgr_data -> P_array)[i-1]); + + if ((mgr_data -> RT_array)[i-1]) + hypre_ParCSRMatrixDestroy((mgr_data -> RT_array)[i-1]); + + hypre_TFree((mgr_data -> CF_marker_array)[i-1]); + } + for (i=1; i < (num_coarse_levels); i++) { + if ((mgr_data -> A_array)[i]) + hypre_ParCSRMatrixDestroy((mgr_data -> A_array)[i]); + } + } + + if((mgr_data -> F_array)) + { + hypre_TFree((mgr_data -> F_array)); + (mgr_data -> F_array) = NULL; + } + if((mgr_data -> U_array)) + { + hypre_TFree((mgr_data -> U_array)); + (mgr_data -> U_array) = NULL; + } + if((mgr_data -> A_array)) + { + hypre_TFree((mgr_data -> A_array)); + (mgr_data -> A_array) = NULL; + } + if((mgr_data -> P_array)) + { + hypre_TFree((mgr_data -> P_array)); + (mgr_data -> P_array) = NULL; + } + if((mgr_data -> RT_array)) + { + hypre_TFree((mgr_data -> RT_array)); + (mgr_data -> RT_array) = NULL; + } + if((mgr_data -> CF_marker_array)) + { + hypre_TFree((mgr_data -> CF_marker_array)); + (mgr_data -> CF_marker_array) = NULL; + } + if((mgr_data -> reserved_Cpoint_local_indexes)) + { + hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes)); + (mgr_data -> reserved_Cpoint_local_indexes) = NULL; + } + + /* data for V-cycle F-relaxation */ + if (mgr_data -> FrelaxVcycleData) { + for (i = 0; i < num_coarse_levels; i++) { + if ((mgr_data -> FrelaxVcycleData)[i]) { + hypre_MGRDestroyFrelaxVcycleData((mgr_data -> FrelaxVcycleData)[i]); + (mgr_data -> FrelaxVcycleData)[i] = NULL; + } + } + hypre_TFree(mgr_data -> FrelaxVcycleData); + mgr_data -> FrelaxVcycleData = NULL; + } + /* data for reserved coarse nodes */ + if(mgr_data -> reserved_coarse_indexes) + { + hypre_TFree(mgr_data -> reserved_coarse_indexes); + (mgr_data -> reserved_coarse_indexes) = NULL; + } + /* coarse level matrix - RAP */ + if ((mgr_data -> RAP)) + hypre_ParCSRMatrixDestroy((mgr_data -> RAP)); + if ((mgr_data -> diaginv)) + hypre_TFree((mgr_data -> diaginv)); + /* mgr data */ + hypre_TFree(mgr_data); + + return hypre_error_flag; +} + +/* Create data for V-cycle F-relaxtion */ +void * +hypre_MGRCreateFrelaxVcycleData() +{ + hypre_ParAMGData *vdata = hypre_CTAlloc(hypre_ParAMGData, 1); + + hypre_ParAMGDataAArray(vdata) = NULL; + hypre_ParAMGDataPArray(vdata) = NULL; + hypre_ParAMGDataFArray(vdata) = NULL; + hypre_ParAMGDataCFMarkerArray(vdata) = NULL; + hypre_ParAMGDataVtemp(vdata) = NULL; + hypre_ParAMGDataAMat(vdata) = NULL; + hypre_ParAMGDataBVec(vdata) = NULL; + hypre_ParAMGDataZtemp(vdata) = NULL; + hypre_ParAMGDataCommInfo(vdata) = NULL; + hypre_ParAMGDataUArray(vdata) = NULL; + hypre_ParAMGDataNewComm(vdata) = hypre_MPI_COMM_NULL; + hypre_ParAMGDataNumLevels(vdata) = 0; + hypre_ParAMGDataMaxLevels(vdata) = 10; + + return (void *) vdata; +} +/* Destroy data for V-cycle F-relaxation */ +HYPRE_Int +hypre_MGRDestroyFrelaxVcycleData( void *data ) +{ + hypre_ParAMGData * vdata = (hypre_ParAMGData*) data; + HYPRE_Int i; + HYPRE_Int num_levels = hypre_ParAMGDataNumLevels(vdata); + MPI_Comm new_comm = hypre_ParAMGDataNewComm(vdata); + + for (i=1; i < num_levels; i++) + { + hypre_ParVectorDestroy(hypre_ParAMGDataFArray(vdata)[i]); + hypre_ParVectorDestroy(hypre_ParAMGDataUArray(vdata)[i]); + + if (hypre_ParAMGDataAArray(vdata)[i]) + hypre_ParCSRMatrixDestroy(hypre_ParAMGDataAArray(vdata)[i]); + + if (hypre_ParAMGDataPArray(vdata)[i-1]) + hypre_ParCSRMatrixDestroy(hypre_ParAMGDataPArray(vdata)[i-1]); + + hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata)[i-1]); + } + + /* see comments in par_coarsen.c regarding special case for CF_marker */ + if (num_levels == 1) + { + hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata)[0]); + } + +/* Points to vtemp of mgr_data, which is already destroyed */ +// hypre_ParVectorDestroy(hypre_ParAMGDataVtemp(vdata)); + hypre_TFree(hypre_ParAMGDataFArray(vdata)); + hypre_TFree(hypre_ParAMGDataUArray(vdata)); + hypre_TFree(hypre_ParAMGDataAArray(vdata)); + hypre_TFree(hypre_ParAMGDataPArray(vdata)); + hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata)); + +/* Points to ztemp of mgr_data, which is already destroyed */ +/* + if (hypre_ParAMGDataZtemp(vdata)) + hypre_ParVectorDestroy(hypre_ParAMGDataZtemp(vdata)); +*/ + + if (hypre_ParAMGDataAMat(vdata)) hypre_TFree(hypre_ParAMGDataAMat(vdata)); + if (hypre_ParAMGDataBVec(vdata)) hypre_TFree(hypre_ParAMGDataBVec(vdata)); + if (hypre_ParAMGDataCommInfo(vdata)) hypre_TFree(hypre_ParAMGDataCommInfo(vdata)); + + if (new_comm != hypre_MPI_COMM_NULL) + { + hypre_MPI_Comm_free (&new_comm); + } + hypre_TFree(vdata); + + return hypre_error_flag; +} + +/* Set C-point variables for each reduction level */ +/* Currently not implemented */ +HYPRE_Int +hypre_MGRSetReductionLevelCpoints( void *mgr_vdata, + HYPRE_Int nlevels, + HYPRE_Int *num_coarse_points, + HYPRE_Int **level_coarse_indexes) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> num_coarse_levels) = nlevels; + (mgr_data -> num_coarse_per_level) = num_coarse_points; + (mgr_data -> level_coarse_indexes) = level_coarse_indexes; + return hypre_error_flag; +} + +/* Initialize some data */ +/* Set whether non-coarse points on each level should be explicitly tagged as F-points */ +HYPRE_Int +hypre_MGRSetNonCpointsToFpoints( void *mgr_vdata, HYPRE_Int nonCptToFptFlag) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> set_non_Cpoints_to_F) = nonCptToFptFlag; + + return hypre_error_flag; +} + +/* Initialize/ set block data information */ +HYPRE_Int +hypre_MGRSetCpointsByBlock( void *mgr_vdata, + HYPRE_Int block_size, + HYPRE_Int max_num_levels, + HYPRE_Int *block_num_coarse_points, + HYPRE_Int **block_coarse_indexes) +{ + HYPRE_Int i,j; + HYPRE_Int **block_cf_marker = NULL; + HYPRE_Int *block_num_coarse_indexes = NULL; + + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + /* free block cf_marker data if not previously destroyed */ + if((mgr_data -> block_cf_marker) != NULL) + { + for (i=0; i < (mgr_data -> max_num_coarse_levels); i++) + { + if ((mgr_data -> block_cf_marker)[i]) + { + hypre_TFree ((mgr_data -> block_cf_marker)[i]); + (mgr_data -> block_cf_marker)[i] = NULL; + } + } + hypre_TFree (mgr_data -> block_cf_marker); + (mgr_data -> block_cf_marker) = NULL; + } + if((mgr_data -> block_num_coarse_indexes)) + { + hypre_TFree((mgr_data -> block_num_coarse_indexes)); + (mgr_data -> block_num_coarse_indexes) = NULL; + } + + /* store block cf_marker */ + block_cf_marker = hypre_CTAlloc(HYPRE_Int *,max_num_levels); + for (i = 0; i < max_num_levels; i++) + { + block_cf_marker[i] = hypre_CTAlloc(HYPRE_Int,block_size); + memset(block_cf_marker[i], FMRK, block_size*sizeof(HYPRE_Int)); + } + for (i = 0; i < max_num_levels; i++) + { + for(j=0; j 0) + { + block_num_coarse_indexes = hypre_CTAlloc(HYPRE_Int, max_num_levels); + for(i=0; i max_num_coarse_levels) = max_num_levels; + (mgr_data -> block_size) = block_size; + (mgr_data -> block_num_coarse_indexes) = block_num_coarse_indexes; + (mgr_data -> block_cf_marker) = block_cf_marker; + + return hypre_error_flag; +} + +/*Set number of points that remain part of the coarse grid throughout the hierarchy */ +HYPRE_Int +hypre_MGRSetReservedCoarseNodes(void *mgr_vdata, + HYPRE_Int reserved_coarse_size, + HYPRE_Int *reserved_cpt_index) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + HYPRE_Int *reserved_coarse_indexes = NULL; + HYPRE_Int i; + + if (!mgr_data) + { + hypre_printf("Warning! MGR object empty!\n"); + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if(reserved_coarse_size < 0) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + /* free data not previously destroyed */ + if((mgr_data -> reserved_coarse_indexes)) + { + hypre_TFree((mgr_data -> reserved_coarse_indexes)); + (mgr_data -> reserved_coarse_indexes) = NULL; + } + + /* set reserved coarse nodes */ + if(reserved_coarse_size > 0) + { + reserved_coarse_indexes = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size); + for(i=0; i reserved_coarse_size) = reserved_coarse_size; + (mgr_data -> reserved_coarse_indexes) = reserved_coarse_indexes; + + return hypre_error_flag; +} + +/* Set CF marker array */ +HYPRE_Int +hypre_MGRCoarsen(hypre_ParCSRMatrix *S, + hypre_ParCSRMatrix *A, + HYPRE_Int fixed_coarse_size, + HYPRE_Int *fixed_coarse_indexes, + HYPRE_Int debug_flag, + HYPRE_Int **CF_marker, + HYPRE_Int cflag) +{ + HYPRE_Int *cf_marker, i, row, nc; + HYPRE_Int *cindexes = fixed_coarse_indexes; + + HYPRE_Int nloc = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A)); + + /* If this is the last level, coarsen onto fixed coarse set */ + if(cflag) + { + if(*CF_marker != NULL) + { + hypre_TFree (*CF_marker); + } + cf_marker = hypre_CTAlloc(HYPRE_Int,nloc); + memset(cf_marker, FMRK, nloc*sizeof(HYPRE_Int)); + + /* first mark fixed coarse set */ + nc = fixed_coarse_size; + for(i = 0; i < nc; i++) + { + cf_marker[cindexes[i]] = CMRK; + } + } + else { + /* First coarsen to get initial CF splitting. + * This is then followed by updating the CF marker to pass + * coarse information to the next levels. NOTE: It may be + * convenient to implement this way (allows the use of multiple + * coarsening strategies without changing too much code), + * but not necessarily the best option, compared to initializing + * CF_marker first and then coarsening on subgraph which excludes + * the initialized coarse nodes. + */ + hypre_BoomerAMGCoarsen(S, A, 0, debug_flag, &cf_marker); + + /* Update CF_marker to correct Cpoints marked as Fpoints. */ + nc = fixed_coarse_size; + for(i = 0; i < nc; i++) + { + cf_marker[cindexes[i]] = CMRK; + } + /* set F-points to FMRK. This is necessary since the different coarsening schemes differentiate + * between type of F-points (example Ruge coarsening). We do not need that distinction here. + */ + for (row = 0; row = 0) + { + jj_count[j]++; + fine_to_coarse[i] = coarse_counter[j]; + coarse_counter[j]++; + } + /*-------------------------------------------------------------------- + * If i is an F-point, interpolation is the approximation of A_{ff}^{-1}A_{fc} + *--------------------------------------------------------------------*/ + else + { + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + if (CF_marker[i1] >= 0) + { + jj_count[j]++; + } + } + + if (num_procs > 1) + { + for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) + { + i1 = A_offd_j[jj]; + if (CF_marker_offd[i1] >= 0) + { + jj_count_offd[j]++; + } + } + } + } + } + } + + /*----------------------------------------------------------------------- + * Allocate arrays. + *-----------------------------------------------------------------------*/ + for (i=0; i < num_threads-1; i++) + { + coarse_counter[i+1] += coarse_counter[i]; + jj_count[i+1] += jj_count[i]; + jj_count_offd[i+1] += jj_count_offd[i]; + } + i = num_threads-1; + jj_counter = jj_count[i]; + jj_counter_offd = jj_count_offd[i]; + + P_diag_size = jj_counter; + + P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1); + P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_size); + P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size); + + P_diag_i[n_fine] = jj_counter; + + + P_offd_size = jj_counter_offd; + + P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1); + P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_size); + P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size); + + /*----------------------------------------------------------------------- + * Intialize some stuff. + *-----------------------------------------------------------------------*/ + + jj_counter = start_indexing; + jj_counter_offd = start_indexing; + + if (debug_flag==4) + { + wall_time = time_getWallclockSeconds() - wall_time; + hypre_printf("Proc = %d Interp: Internal work 1 = %f\n", + my_id, wall_time); + fflush(NULL); + } + + /*----------------------------------------------------------------------- + * Send and receive fine_to_coarse info. + *-----------------------------------------------------------------------*/ + + if (debug_flag==4) wall_time = time_getWallclockSeconds(); + + fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE +#endif +#endif + for (j = 0; j < num_threads; j++) + { + coarse_shift = 0; + if (j > 0) coarse_shift = coarse_counter[j-1]; + size = n_fine/num_threads; + rest = n_fine - size*num_threads; + if (j < rest) + { + ns = j*size+j; + ne = (j+1)*size+j+1; + } + else + { + ns = j*size+rest; + ne = (j+1)*size+rest; + } + for (i = ns; i < ne; i++) + fine_to_coarse[i] += my_first_cpt+coarse_shift; + } + + index = 0; + for (i = 0; i < num_sends; i++) + { + start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) + int_buf_data[index++] + = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; + } + + comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, + fine_to_coarse_offd); + + hypre_ParCSRCommHandleDestroy(comm_handle); + + if (debug_flag==4) + { + wall_time = time_getWallclockSeconds() - wall_time; + hypre_printf("Proc = %d Interp: Comm 4 FineToCoarse = %f\n", + my_id, wall_time); + fflush(NULL); + } + + if (debug_flag==4) wall_time = time_getWallclockSeconds(); + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt; + + /*----------------------------------------------------------------------- + * Loop over fine grid points. + *-----------------------------------------------------------------------*/ + a_diag = hypre_CTAlloc(HYPRE_Real, n_fine); + for (i = 0; i < n_fine; i++) + { + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + if ( i==i1 ) /* diagonal of A only */ + { + a_diag[i] = 1.0/A_diag_data[jj]; + } + } + } + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i,j,jl,i1,jj,ns,ne,size,rest,P_marker,P_marker_offd,jj_counter,jj_counter_offd,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE +#endif +#endif + for (jl = 0; jl < num_threads; jl++) + { + size = n_fine/num_threads; + rest = n_fine - size*num_threads; + if (jl < rest) + { + ns = jl*size+jl; + ne = (jl+1)*size+jl+1; + } + else + { + ns = jl*size+rest; + ne = (jl+1)*size+rest; + } + jj_counter = 0; + if (jl > 0) jj_counter = jj_count[jl-1]; + jj_counter_offd = 0; + if (jl > 0) jj_counter_offd = jj_count_offd[jl-1]; + P_marker = hypre_CTAlloc(HYPRE_Int, n_fine); + if (num_cols_A_offd) + P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); + else + P_marker_offd = NULL; + + for (i = 0; i < n_fine; i++) + { + P_marker[i] = -1; + } + for (i = 0; i < num_cols_A_offd; i++) + { + P_marker_offd[i] = -1; + } + for (i = ns; i < ne; i++) + { + /*-------------------------------------------------------------------- + * If i is a c-point, interpolation is the identity. + *--------------------------------------------------------------------*/ + if (CF_marker[i] >= 0) + { + P_diag_i[i] = jj_counter; + P_diag_j[jj_counter] = fine_to_coarse[i]; + P_diag_data[jj_counter] = one; + jj_counter++; + } + /*-------------------------------------------------------------------- + * If i is an F-point, build interpolation. + *--------------------------------------------------------------------*/ + else + { + /* Diagonal part of P */ + P_diag_i[i] = jj_counter; + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + + /*-------------------------------------------------------------- + * If neighbor i1 is a C-point, set column number in P_diag_j + * and initialize interpolation weight to zero. + *--------------------------------------------------------------*/ + + if (CF_marker[i1] >= 0) + { + P_marker[i1] = jj_counter; + P_diag_j[jj_counter] = fine_to_coarse[i1]; + if(method == 0) + { + P_diag_data[jj_counter] = 0.0; + } + else if (method == 1) + { + P_diag_data[jj_counter] = - A_diag_data[jj]; + } + else if (method == 2) + { + P_diag_data[jj_counter] = - A_diag_data[jj]*a_diag[i]; + + } + jj_counter++; + } + } + + /* Off-Diagonal part of P */ + P_offd_i[i] = jj_counter_offd; + + if (num_procs > 1) + { + for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) + { + i1 = A_offd_j[jj]; + + /*----------------------------------------------------------- + * If neighbor i1 is a C-point, set column number in P_offd_j + * and initialize interpolation weight to zero. + *-----------------------------------------------------------*/ + + if (CF_marker_offd[i1] >= 0) + { + P_marker_offd[i1] = jj_counter_offd; + /*P_offd_j[jj_counter_offd] = fine_to_coarse_offd[i1];*/ + P_offd_j[jj_counter_offd] = i1; + if(method == 0) + { + P_offd_data[jj_counter_offd] = 0.0; + } + else if (method == 1) + { + P_offd_data[jj_counter_offd] = - A_offd_data[jj]; + } + else if (method == 2) + { + P_offd_data[jj_counter_offd] = - A_offd_data[jj]*a_diag[i]; + } + + jj_counter_offd++; + } + } + } + } + P_offd_i[i+1] = jj_counter_offd; + } + hypre_TFree(P_marker); + hypre_TFree(P_marker_offd); + } + hypre_TFree(a_diag); + P = hypre_ParCSRMatrixCreate(comm, + hypre_ParCSRMatrixGlobalNumRows(A), + total_global_cpts, + hypre_ParCSRMatrixColStarts(A), + num_cpts_global, + 0, + P_diag_i[n_fine], + P_offd_i[n_fine]); + + P_diag = hypre_ParCSRMatrixDiag(P); + hypre_CSRMatrixData(P_diag) = P_diag_data; + hypre_CSRMatrixI(P_diag) = P_diag_i; + hypre_CSRMatrixJ(P_diag) = P_diag_j; + P_offd = hypre_ParCSRMatrixOffd(P); + hypre_CSRMatrixData(P_offd) = P_offd_data; + hypre_CSRMatrixI(P_offd) = P_offd_i; + hypre_CSRMatrixJ(P_offd) = P_offd_j; + hypre_ParCSRMatrixOwnsRowStarts(P) = 0; + + num_cols_P_offd = 0; + + if (P_offd_size) + { + P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i=0; i < num_cols_A_offd; i++) + P_marker[i] = 0; + num_cols_P_offd = 0; + for (i=0; i < P_offd_size; i++) + { + index = P_offd_j[i]; + if (!P_marker[index]) + { + num_cols_P_offd++; + P_marker[index] = 1; + } + } + + col_map_offd_P = hypre_CTAlloc(HYPRE_Int,num_cols_P_offd); + index = 0; + for (i=0; i < num_cols_P_offd; i++) + { + while (P_marker[index]==0) index++; + col_map_offd_P[i] = index++; + } + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i=0; i < P_offd_size; i++) + P_offd_j[i] = hypre_BinarySearch(col_map_offd_P, + P_offd_j[i], + num_cols_P_offd); + hypre_TFree(P_marker); + } + + for (i=0; i < n_fine; i++) + if (CF_marker[i] == -3) CF_marker[i] = -1; + if (num_cols_P_offd) + { + hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P; + hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd; + } + hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse_offd); + + *P_ptr = P; + + hypre_TFree(CF_marker_offd); + hypre_TFree(int_buf_data); + hypre_TFree(fine_to_coarse); + hypre_TFree(fine_to_coarse_offd); + hypre_TFree(coarse_counter); + hypre_TFree(jj_count); + hypre_TFree(jj_count_offd); + + return(0); +} + + +/* Interpolation for MGR - Dynamic Row Sum method */ + +HYPRE_Int +hypre_MGRBuildPDRS( hypre_ParCSRMatrix *A, + HYPRE_Int *CF_marker, + HYPRE_Int *num_cpts_global, + HYPRE_Int blk_size, + HYPRE_Int reserved_coarse_size, + HYPRE_Int debug_flag, + hypre_ParCSRMatrix **P_ptr) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); + hypre_ParCSRCommHandle *comm_handle; + + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd); + HYPRE_Real *a_diag; + + hypre_ParCSRMatrix *P; + HYPRE_Int *col_map_offd_P; + + HYPRE_Int *CF_marker_offd = NULL; + + hypre_CSRMatrix *P_diag; + hypre_CSRMatrix *P_offd; + + HYPRE_Real *P_diag_data; + HYPRE_Int *P_diag_i; + HYPRE_Int *P_diag_j; + HYPRE_Real *P_offd_data; + HYPRE_Int *P_offd_i; + HYPRE_Int *P_offd_j; + + HYPRE_Int P_diag_size, P_offd_size; + + HYPRE_Int *P_marker, *P_marker_offd; + + HYPRE_Int jj_counter,jj_counter_offd; + HYPRE_Int *jj_count, *jj_count_offd; +// HYPRE_Int jj_begin_row,jj_begin_row_offd; +// HYPRE_Int jj_end_row,jj_end_row_offd; + + HYPRE_Int start_indexing = 0; /* start indexing for P_data at 0 */ + + HYPRE_Int n_fine = hypre_CSRMatrixNumRows(A_diag); + + HYPRE_Int *fine_to_coarse; + HYPRE_Int *fine_to_coarse_offd; + HYPRE_Int *coarse_counter; + HYPRE_Int coarse_shift; + HYPRE_Int total_global_cpts; + HYPRE_Int num_cols_P_offd,my_first_cpt; + + HYPRE_Int i,i1; + HYPRE_Int j,jl,jj; + HYPRE_Int start; + + HYPRE_Real one = 1.0; + + HYPRE_Int my_id; + HYPRE_Int num_procs; + HYPRE_Int num_threads; + HYPRE_Int num_sends; + HYPRE_Int index; + HYPRE_Int ns, ne, size, rest; + + HYPRE_Int *int_buf_data; + + HYPRE_Real wall_time; /* for debugging instrumentation */ + + hypre_MPI_Comm_size(comm, &num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + num_threads = hypre_NumThreads(); + +#ifdef HYPRE_NO_GLOBAL_PARTITION + my_first_cpt = num_cpts_global[0]; + if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1]; + hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_INT, num_procs-1, comm); +#else + my_first_cpt = num_cpts_global[my_id]; + total_global_cpts = num_cpts_global[num_procs]; +#endif + + /*------------------------------------------------------------------- + * Get the CF_marker data for the off-processor columns + *-------------------------------------------------------------------*/ + + if (debug_flag < 0) + { + debug_flag = -debug_flag; + } + + if (debug_flag==4) wall_time = time_getWallclockSeconds(); + + if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); + + if (!comm_pkg) + { + hypre_MatvecCommPkgCreate(A); + comm_pkg = hypre_ParCSRMatrixCommPkg(A); + } + + num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); + int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, + num_sends)); + + index = 0; + for (i = 0; i < num_sends; i++) + { + start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) + int_buf_data[index++] + = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; + } + + comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, + CF_marker_offd); + hypre_ParCSRCommHandleDestroy(comm_handle); + + if (debug_flag==4) + { + wall_time = time_getWallclockSeconds() - wall_time; + hypre_printf("Proc = %d Interp: Comm 1 CF_marker = %f\n", + my_id, wall_time); + fflush(NULL); + } + + /*----------------------------------------------------------------------- + * First Pass: Determine size of P and fill in fine_to_coarse mapping. + *-----------------------------------------------------------------------*/ + + /*----------------------------------------------------------------------- + * Intialize counters and allocate mapping vector. + *-----------------------------------------------------------------------*/ + + coarse_counter = hypre_CTAlloc(HYPRE_Int, num_threads); + jj_count = hypre_CTAlloc(HYPRE_Int, num_threads); + jj_count_offd = hypre_CTAlloc(HYPRE_Int, num_threads); + + fine_to_coarse = hypre_CTAlloc(HYPRE_Int, n_fine); +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1; + + jj_counter = start_indexing; + jj_counter_offd = start_indexing; + + /*----------------------------------------------------------------------- + * Loop over fine grid. + *-----------------------------------------------------------------------*/ + +/* RDF: this looks a little tricky, but doable */ +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE +#endif +#endif + for (j = 0; j < num_threads; j++) + { + size = n_fine/num_threads; + rest = n_fine - size*num_threads; + + if (j < rest) + { + ns = j*size+j; + ne = (j+1)*size+j+1; + } + else + { + ns = j*size+rest; + ne = (j+1)*size+rest; + } + for (i = ns; i < ne; i++) + { + /*-------------------------------------------------------------------- + * If i is a C-point, interpolation is the identity. Also set up + * mapping vector. + *--------------------------------------------------------------------*/ + + if (CF_marker[i] >= 0) + { + jj_count[j]++; + fine_to_coarse[i] = coarse_counter[j]; + coarse_counter[j]++; + } + /*-------------------------------------------------------------------- + * If i is an F-point, interpolation is the approximation of A_{ff}^{-1}A_{fc} + *--------------------------------------------------------------------*/ + else + { + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + if (CF_marker[i1] >= 0) + { + jj_count[j]++; + } + } + + if (num_procs > 1) + { + for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) + { + i1 = A_offd_j[jj]; + if (CF_marker_offd[i1] >= 0) + { + jj_count_offd[j]++; + } + } + } + } + /*-------------------------------------------------------------------- + * Set up the indexes for the DRS method + *--------------------------------------------------------------------*/ + + } + } + + /*----------------------------------------------------------------------- + * Allocate arrays. + *-----------------------------------------------------------------------*/ + for (i=0; i < num_threads-1; i++) + { + coarse_counter[i+1] += coarse_counter[i]; + jj_count[i+1] += jj_count[i]; + jj_count_offd[i+1] += jj_count_offd[i]; + } + i = num_threads-1; + jj_counter = jj_count[i]; + jj_counter_offd = jj_count_offd[i]; + + P_diag_size = jj_counter; + + P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1); + P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_size); + P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size); + + P_diag_i[n_fine] = jj_counter; + + + P_offd_size = jj_counter_offd; + + P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1); + P_offd_j = hypre_CTAlloc(HYPRE_Int, P_offd_size); + P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size); + + /*----------------------------------------------------------------------- + * Intialize some stuff. + *-----------------------------------------------------------------------*/ + + jj_counter = start_indexing; + jj_counter_offd = start_indexing; + + if (debug_flag==4) + { + wall_time = time_getWallclockSeconds() - wall_time; + hypre_printf("Proc = %d Interp: Internal work 1 = %f\n", + my_id, wall_time); + fflush(NULL); + } + + /*----------------------------------------------------------------------- + * Send and receive fine_to_coarse info. + *-----------------------------------------------------------------------*/ + + if (debug_flag==4) wall_time = time_getWallclockSeconds(); + + fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE +#endif +#endif + for (j = 0; j < num_threads; j++) + { + coarse_shift = 0; + if (j > 0) coarse_shift = coarse_counter[j-1]; + size = n_fine/num_threads; + rest = n_fine - size*num_threads; + if (j < rest) + { + ns = j*size+j; + ne = (j+1)*size+j+1; + } + else + { + ns = j*size+rest; + ne = (j+1)*size+rest; + } + for (i = ns; i < ne; i++) + fine_to_coarse[i] += my_first_cpt+coarse_shift; + } + + index = 0; + for (i = 0; i < num_sends; i++) + { + start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) + int_buf_data[index++] + = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; + } + + comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, + fine_to_coarse_offd); + + hypre_ParCSRCommHandleDestroy(comm_handle); + + if (debug_flag==4) + { + wall_time = time_getWallclockSeconds() - wall_time; + hypre_printf("Proc = %d Interp: Comm 4 FineToCoarse = %f\n", + my_id, wall_time); + fflush(NULL); + } + + if (debug_flag==4) wall_time = time_getWallclockSeconds(); + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt; + + /*----------------------------------------------------------------------- + * Loop over fine grid points. + *-----------------------------------------------------------------------*/ + a_diag = hypre_CTAlloc(HYPRE_Real, n_fine); + for (i = 0; i < n_fine; i++) + { + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + if ( i==i1 ) /* diagonal of A only */ + { + a_diag[i] = 1.0/A_diag_data[jj]; + } + } + } + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i,j,jl,i1,jj,ns,ne,size,rest,P_marker,P_marker_offd,jj_counter,jj_counter_offd,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE +#endif +#endif + for (jl = 0; jl < num_threads; jl++) + { + size = n_fine/num_threads; + rest = n_fine - size*num_threads; + if (jl < rest) + { + ns = jl*size+jl; + ne = (jl+1)*size+jl+1; + } + else + { + ns = jl*size+rest; + ne = (jl+1)*size+rest; + } + jj_counter = 0; + if (jl > 0) jj_counter = jj_count[jl-1]; + jj_counter_offd = 0; + if (jl > 0) jj_counter_offd = jj_count_offd[jl-1]; + P_marker = hypre_CTAlloc(HYPRE_Int, n_fine); + if (num_cols_A_offd) + P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); + else + P_marker_offd = NULL; + + for (i = 0; i < n_fine; i++) + { + P_marker[i] = -1; + } + for (i = 0; i < num_cols_A_offd; i++) + { + P_marker_offd[i] = -1; + } + for (i = ns; i < ne; i++) + { + /*-------------------------------------------------------------------- + * If i is a c-point, interpolation is the identity. + *--------------------------------------------------------------------*/ + if (CF_marker[i] >= 0) + { + P_diag_i[i] = jj_counter; + P_diag_j[jj_counter] = fine_to_coarse[i]; + P_diag_data[jj_counter] = one; + jj_counter++; + } + /*-------------------------------------------------------------------- + * If i is an F-point, build interpolation. + *--------------------------------------------------------------------*/ + else + { + /* Diagonal part of P */ + P_diag_i[i] = jj_counter; + for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) + { + i1 = A_diag_j[jj]; + + /*-------------------------------------------------------------- + * If neighbor i1 is a C-point, set column number in P_diag_j + * and initialize interpolation weight to zero. + *--------------------------------------------------------------*/ + + if (CF_marker[i1] >= 0) + { + P_marker[i1] = jj_counter; + P_diag_j[jj_counter] = fine_to_coarse[i1]; + P_diag_data[jj_counter] = - A_diag_data[jj]*a_diag[i]; + + jj_counter++; + } + } + + /* Off-Diagonal part of P */ + P_offd_i[i] = jj_counter_offd; + + if (num_procs > 1) + { + for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) + { + i1 = A_offd_j[jj]; + + /*----------------------------------------------------------- + * If neighbor i1 is a C-point, set column number in P_offd_j + * and initialize interpolation weight to zero. + *-----------------------------------------------------------*/ + + if (CF_marker_offd[i1] >= 0) + { + P_marker_offd[i1] = jj_counter_offd; + /*P_offd_j[jj_counter_offd] = fine_to_coarse_offd[i1];*/ + P_offd_j[jj_counter_offd] = i1; + P_offd_data[jj_counter_offd] = - A_offd_data[jj]*a_diag[i]; + + jj_counter_offd++; + } + } + } + } + P_offd_i[i+1] = jj_counter_offd; + } + hypre_TFree(P_marker); + hypre_TFree(P_marker_offd); + } + hypre_TFree(a_diag); + P = hypre_ParCSRMatrixCreate(comm, + hypre_ParCSRMatrixGlobalNumRows(A), + total_global_cpts, + hypre_ParCSRMatrixColStarts(A), + num_cpts_global, + 0, + P_diag_i[n_fine], + P_offd_i[n_fine]); + + P_diag = hypre_ParCSRMatrixDiag(P); + hypre_CSRMatrixData(P_diag) = P_diag_data; + hypre_CSRMatrixI(P_diag) = P_diag_i; + hypre_CSRMatrixJ(P_diag) = P_diag_j; + P_offd = hypre_ParCSRMatrixOffd(P); + hypre_CSRMatrixData(P_offd) = P_offd_data; + hypre_CSRMatrixI(P_offd) = P_offd_i; + hypre_CSRMatrixJ(P_offd) = P_offd_j; + hypre_ParCSRMatrixOwnsRowStarts(P) = 0; + + num_cols_P_offd = 0; + + if (P_offd_size) + { + P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd); +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i=0; i < num_cols_A_offd; i++) + P_marker[i] = 0; + num_cols_P_offd = 0; + for (i=0; i < P_offd_size; i++) + { + index = P_offd_j[i]; + if (!P_marker[index]) + { + num_cols_P_offd++; + P_marker[index] = 1; + } + } + + col_map_offd_P = hypre_CTAlloc(HYPRE_Int,num_cols_P_offd); + index = 0; + for (i=0; i < num_cols_P_offd; i++) + { + while (P_marker[index]==0) index++; + col_map_offd_P[i] = index++; + } + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i=0; i < P_offd_size; i++) + P_offd_j[i] = hypre_BinarySearch(col_map_offd_P, + P_offd_j[i], + num_cols_P_offd); + hypre_TFree(P_marker); + } + + for (i=0; i < n_fine; i++) + if (CF_marker[i] == -3) CF_marker[i] = -1; + if (num_cols_P_offd) + { + hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P; + hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd; + } + hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse_offd); + + *P_ptr = P; + + hypre_TFree(CF_marker_offd); + hypre_TFree(int_buf_data); + hypre_TFree(fine_to_coarse); + hypre_TFree(fine_to_coarse_offd); + hypre_TFree(coarse_counter); + hypre_TFree(jj_count); + hypre_TFree(jj_count_offd); + + return(0); +} + +/* Setup interpolation operator */ +HYPRE_Int +hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, + HYPRE_Int *CF_marker, + hypre_ParCSRMatrix *S, + HYPRE_Int *num_cpts_global, + HYPRE_Int num_functions, + HYPRE_Int *dof_func, + HYPRE_Int debug_flag, + HYPRE_Real trunc_factor, + HYPRE_Int max_elmts, + HYPRE_Int *col_offd_S_to_A, + hypre_ParCSRMatrix **P, + HYPRE_Int last_level, + HYPRE_Int method, + HYPRE_Int numsweeps) +{ + HYPRE_Int i; + hypre_ParCSRMatrix *P_ptr = NULL; + HYPRE_Real jac_trunc_threshold = trunc_factor; + HYPRE_Real jac_trunc_threshold_minus = 0.5*jac_trunc_threshold; + + /* Build interpolation operator using (hypre default) */ + if(!last_level) + { + hypre_MGRBuildP( A,CF_marker,num_cpts_global,2,debug_flag,&P_ptr); + +// hypre_BoomerAMGBuildInterp(A, CF_marker, S, num_cpts_global,1, NULL,debug_flag, +// trunc_factor, max_elmts, col_offd_S_to_A, &P_ptr); + } + /* Do Jacobi interpolation for last level */ + else + { + if (method <3) + { + hypre_MGRBuildP( A,CF_marker,num_cpts_global,method,debug_flag,&P_ptr); + /* Could do a few sweeps of Jacobi to further improve P */ + // for(i=0; i block_size); + HYPRE_Int reserved_coarse_size = (mgr_data -> reserved_coarse_size); + + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + + hypre_ParCSRMatrix *B; + + hypre_CSRMatrix *B_diag; + HYPRE_Real *B_diag_data; + HYPRE_Int *B_diag_i; + HYPRE_Int *B_diag_j; + + hypre_CSRMatrix *B_offd; + HYPRE_Int i,ii; + HYPRE_Int j,jj; + HYPRE_Int k; + + HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int n_block, left_size,inv_size; + +// HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Int bidx,bidxm1,bidxp1; + HYPRE_Real * diaginv; + + const HYPRE_Int nb2 = blk_size*blk_size; + + HYPRE_Int block_scaling_error = 0; + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); +// HYPRE_Int num_threads = hypre_NumThreads(); + + //printf("n = %d\n",n); + + if (my_id == num_procs) + { + n_block = (n - reserved_coarse_size) / blk_size; + left_size = n - blk_size*n_block; + } + else + { + n_block = n / blk_size; + left_size = n - blk_size*n_block; + } + + inv_size = nb2*n_block + left_size*left_size; + + //printf("inv_size = %d\n",inv_size); + + hypre_blockRelax_setup(A,blk_size,reserved_coarse_size,&(mgr_data -> diaginv)); + +// if (debug_flag==4) wall_time = time_getWallclockSeconds(); + + /*----------------------------------------------------------------------- + * First Pass: Determine size of B and fill in + *-----------------------------------------------------------------------*/ + + B_diag_i = hypre_CTAlloc(HYPRE_Int, n+1); + B_diag_j = hypre_CTAlloc(HYPRE_Int, inv_size); + B_diag_data = hypre_CTAlloc(HYPRE_Real, inv_size); + + B_diag_i[n] = inv_size; + + //B_offd_i = hypre_CTAlloc(HYPRE_Int, n+1); + //B_offd_j = hypre_CTAlloc(HYPRE_Int, 1); + //B_offd_data = hypre_CTAlloc(HYPRE_Real,1); + + //B_offd_i[n] = 1; + /*----------------------------------------------------------------- + * Get all the diagonal sub-blocks + *-----------------------------------------------------------------*/ + diaginv = hypre_CTAlloc(HYPRE_Real, nb2); + //printf("n_block = %d\n",n_block); + for (i = 0;i < n_block; i++) + { + bidxm1 = i*blk_size; + bidxp1 = (i+1)*blk_size; + + for (k = 0;k < blk_size; k++) + { + for (j = 0;j < blk_size; j++) + { + bidx = k*blk_size + j; + diaginv[bidx] = 0.0; + } + + for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++) + { + + jj = A_diag_j[ii]; + + if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL) + { + bidx = k*blk_size + jj - bidxm1; + //printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx); + diaginv[bidx] = A_diag_data[ii]; + } + } + } + + /* for (k = 0;k < blk_size; k++) */ + /* { */ + /* for (j = 0;j < blk_size; j++) */ + /* { */ + /* bidx = k*blk_size + j; */ + /* printf("diaginv[%d] = %e\n",bidx,diaginv[bidx]); */ + /* } */ + /* } */ + + hypre_blas_mat_inv(diaginv, blk_size); + + for (k = 0;k < blk_size; k++) + { + B_diag_i[i*blk_size+k] = i*nb2 + k*blk_size; + //B_offd_i[i*nb2+k] = 0; + + for (j = 0;j < blk_size; j++) + { + bidx = i*nb2 + k*blk_size + j; + B_diag_j[bidx] = i*blk_size + j; + B_diag_data[bidx] = diaginv[k*blk_size + j]; + } + } + } + + //printf("Before create\n"); + B = hypre_ParCSRMatrixCreate(comm, + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixGlobalNumCols(A), + hypre_ParCSRMatrixRowStarts(A), + hypre_ParCSRMatrixColStarts(A), + 0, + inv_size, + 0); + //printf("After create\n"); + B_diag = hypre_ParCSRMatrixDiag(B); + hypre_CSRMatrixData(B_diag) = B_diag_data; + hypre_CSRMatrixI(B_diag) = B_diag_i; + hypre_CSRMatrixJ(B_diag) = B_diag_j; + B_offd = hypre_ParCSRMatrixOffd(B); + hypre_CSRMatrixData(B_offd) = NULL; + hypre_CSRMatrixI(B_offd) = NULL; + hypre_CSRMatrixJ(B_offd) = NULL; + /* hypre_ParCSRMatrixOwnsRowStarts(B) = 0; */ + + *B_ptr = B; + + return(block_scaling_error); +} + +HYPRE_Int hypre_block_jacobi (hypre_ParCSRMatrix *A, + hypre_ParVector *f, + hypre_ParVector *u, + HYPRE_Real blk_size, + HYPRE_Int n_block, + HYPRE_Int left_size, + HYPRE_Real *diaginv, + hypre_ParVector *Vtemp) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); + hypre_ParCSRCommHandle *comm_handle; + + HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd); + + hypre_Vector *u_local = hypre_ParVectorLocalVector(u); + HYPRE_Real *u_data = hypre_VectorData(u_local); + + hypre_Vector *f_local = hypre_ParVectorLocalVector(f); + HYPRE_Real *f_data = hypre_VectorData(f_local); + + hypre_Vector *Vtemp_local = hypre_ParVectorLocalVector(Vtemp); + HYPRE_Real *Vtemp_data = hypre_VectorData(Vtemp_local); + HYPRE_Real *Vext_data = NULL; + HYPRE_Real *v_buf_data; + + HYPRE_Int i, j, k; + HYPRE_Int ii, jj; + HYPRE_Int bidx,bidx1; + HYPRE_Int relax_error = 0; + HYPRE_Int num_sends; + HYPRE_Int index, start; + HYPRE_Int num_procs, my_id; + HYPRE_Real *res; + + const HYPRE_Int nb2 = blk_size*blk_size; + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); +// HYPRE_Int num_threads = hypre_NumThreads(); + + res = hypre_CTAlloc(HYPRE_Real, blk_size); + + if (num_procs > 1) + { + num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); + + v_buf_data = hypre_CTAlloc(HYPRE_Real, + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends)); + + Vext_data = hypre_CTAlloc(HYPRE_Real,num_cols_offd); + + if (num_cols_offd) + { + A_offd_j = hypre_CSRMatrixJ(A_offd); + A_offd_data = hypre_CSRMatrixData(A_offd); + } + + index = 0; + for (i = 0; i < num_sends; i++) + { + start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) + v_buf_data[index++] + = u_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; + } + + comm_handle = hypre_ParCSRCommHandleCreate( 1, comm_pkg, v_buf_data, + Vext_data); + } + + /*----------------------------------------------------------------- + * Copy current approximation into temporary vector. + *-----------------------------------------------------------------*/ + +#if 0 +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif +#endif + for (i = 0; i < n; i++) + { + Vtemp_data[i] = u_data[i]; + //printf("u_old[%d] = %e\n",i,Vtemp_data[i]); + } + if (num_procs > 1) + { + hypre_ParCSRCommHandleDestroy(comm_handle); + comm_handle = NULL; + } + + /*----------------------------------------------------------------- + * Relax points block by block + *-----------------------------------------------------------------*/ + for (i = 0;i < n_block; i++) + { + for (j = 0;j < blk_size; j++) + { + bidx = i*blk_size +j; + res[j] = f_data[bidx]; + for (jj = A_diag_i[bidx]; jj < A_diag_i[bidx+1]; jj++) + { + ii = A_diag_j[jj]; + res[j] -= A_diag_data[jj] * Vtemp_data[ii]; + //printf("%d: Au= %e * %e =%e\n",ii,A_diag_data[jj],Vtemp_data[ii], res[j]); + } + for (jj = A_offd_i[bidx]; jj < A_offd_i[bidx+1]; jj++) + { + ii = A_offd_j[jj]; + res[j] -= A_offd_data[jj] * Vext_data[ii]; + } + //printf("%d: res = %e\n",bidx,res[j]); + } + + for (j = 0;j < blk_size; j++) + { + bidx1 = i*blk_size +j; + for (k = 0;k < blk_size; k++) + { + bidx = i*nb2 +j*blk_size+k; + u_data[bidx1] += res[k]*diaginv[bidx]; + //printf("u[%d] = %e, diaginv[%d] = %e\n",bidx1,u_data[bidx1],bidx,diaginv[bidx]); + } + //printf("u[%d] = %e\n",bidx1,u_data[bidx1]); + } + } + + if (num_procs > 1) + { + hypre_TFree(Vext_data); + hypre_TFree(v_buf_data); + } + hypre_TFree(res); + return(relax_error); +} + +/*Block smoother*/ +HYPRE_Int +hypre_blockRelax_setup(hypre_ParCSRMatrix *A, + HYPRE_Int blk_size, + HYPRE_Int reserved_coarse_size, + HYPRE_Real **diaginvptr) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); + + HYPRE_Int i, j,k; + HYPRE_Int ii, jj; + HYPRE_Int bidx,bidxm1,bidxp1; + HYPRE_Int num_procs, my_id; + + const HYPRE_Int nb2 = blk_size*blk_size; + HYPRE_Int n_block; + HYPRE_Int left_size,inv_size; + HYPRE_Real *diaginv = *diaginvptr; + + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); +// HYPRE_Int num_threads = hypre_NumThreads(); + + if (my_id == num_procs) + { + n_block = (n - reserved_coarse_size) / blk_size; + left_size = n - blk_size*n_block; + } + else + { + n_block = n / blk_size; + left_size = n - blk_size*n_block; + } + + inv_size = nb2*n_block + left_size*left_size; + + if (diaginv !=NULL) + { + hypre_TFree(diaginv); + diaginv = hypre_CTAlloc(HYPRE_Real, inv_size); + } + else { + diaginv = hypre_CTAlloc(HYPRE_Real, inv_size); + } + + /*----------------------------------------------------------------- + * Get all the diagonal sub-blocks + *-----------------------------------------------------------------*/ + for (i = 0;i < n_block; i++) + { + bidxm1 = i*blk_size; + bidxp1 = (i+1)*blk_size; + //printf("bidxm1 = %d,bidxp1 = %d\n",bidxm1,bidxp1); + + for (k = 0;k < blk_size; k++) + { + for (j = 0;j < blk_size; j++) + { + bidx = i*nb2 + k*blk_size + j; + diaginv[bidx] = 0.0; + } + + for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++) + { + + jj = A_diag_j[ii]; + + if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL) + { + bidx = i*nb2 + k*blk_size + jj - bidxm1; + //printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx); + diaginv[bidx] = A_diag_data[ii]; + } + } + } + } + + + + for (i = 0;i < left_size; i++) + { + bidxm1 =n_block*nb2 + i*blk_size; + bidxp1 =n_block*nb2 + (i+1)*blk_size; + for (j = 0;j < left_size; j++) + { + bidx = n_block*nb2 + i*blk_size +j; + diaginv[bidx] = 0.0; + } + + for (ii = A_diag_i[n_block*blk_size + i]; ii < A_diag_i[n_block*blk_size+i+1]; ii++) + { + jj = A_diag_j[ii]; + if (jj > n_block*blk_size) + { + bidx = n_block*nb2 + i*blk_size + jj - n_block*blk_size; + diaginv[bidx] = A_diag_data[ii]; + } + } + } + + + /*----------------------------------------------------------------- + * compute the inverses of all the diagonal sub-blocks + *-----------------------------------------------------------------*/ + if (blk_size > 1) + { + for (i = 0;i < n_block; i++) + { + hypre_blas_mat_inv(diaginv+i*nb2, blk_size); + } + hypre_blas_mat_inv(diaginv+(HYPRE_Int)(blk_size*nb2),left_size); + } + else + { + for (i = 0;i < n; i++) + { + // FIX-ME: zero-diagonal should be tested previously + if (fabs(diaginv[i]) < SMALLREAL) + diaginv[i] = 0.0; + else + diaginv[i] = 1.0 / diaginv[i]; + } + } + + *diaginvptr = diaginv; + + return 1; +} + +HYPRE_Int +hypre_blockRelax(hypre_ParCSRMatrix *A, + hypre_ParVector *f, + hypre_ParVector *u, + HYPRE_Int blk_size, + HYPRE_Int reserved_coarse_size, + hypre_ParVector *Vtemp, + hypre_ParVector *Ztemp) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); + + HYPRE_Int i, j,k; + HYPRE_Int ii, jj; + + HYPRE_Int bidx,bidxm1,bidxp1; + HYPRE_Int relax_error = 0; + + HYPRE_Int num_procs, my_id; + + const HYPRE_Int nb2 = blk_size*blk_size; + HYPRE_Int n_block; + HYPRE_Int left_size,inv_size; + HYPRE_Real *diaginv; + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + +// HYPRE_Int num_threads = hypre_NumThreads(); + + if (my_id == num_procs) + { + n_block = (n - reserved_coarse_size) / blk_size; + left_size = n - blk_size*n_block; + } + else + { + n_block = n / blk_size; + left_size = n - blk_size*n_block; + } + + inv_size = nb2*n_block + left_size*left_size; + + diaginv = hypre_CTAlloc(HYPRE_Real, inv_size); + /*----------------------------------------------------------------- + * Get all the diagonal sub-blocks + *-----------------------------------------------------------------*/ + for (i = 0;i < n_block; i++) + { + bidxm1 = i*blk_size; + bidxp1 = (i+1)*blk_size; + //printf("bidxm1 = %d,bidxp1 = %d\n",bidxm1,bidxp1); + + for (k = 0;k < blk_size; k++) + { + for (j = 0;j < blk_size; j++) + { + bidx = i*nb2 + k*blk_size + j; + diaginv[bidx] = 0.0; + } + + for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++) + { + + jj = A_diag_j[ii]; + + if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL) + { + bidx = i*nb2 + k*blk_size + jj - bidxm1; + //printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx); + diaginv[bidx] = A_diag_data[ii]; + } + } + } + + } + + for (i = 0;i < left_size; i++) + { + bidxm1 =n_block*nb2 + i*blk_size; + bidxp1 =n_block*nb2 + (i+1)*blk_size; + for (j = 0;j < left_size; j++) + { + bidx = n_block*nb2 + i*blk_size +j; + diaginv[bidx] = 0.0; + } + + for (ii = A_diag_i[n_block*blk_size + i]; ii < A_diag_i[n_block*blk_size+i+1]; ii++) + { + jj = A_diag_j[ii]; + if (jj > n_block*blk_size) + { + bidx = n_block*nb2 + i*blk_size + jj - n_block*blk_size; + diaginv[bidx] = A_diag_data[ii]; + } + } + } +/* + for (i = 0;i < n_block; i++) + { + for (j = 0;j < blk_size; j++) + { + + for (k = 0;k < blk_size; k ++) + { + bidx = i*nb2 + j*blk_size + k; + printf("%e\t",diaginv[bidx]); + } + printf("\n"); + } + printf("\n"); + } +*/ + /*----------------------------------------------------------------- + * compute the inverses of all the diagonal sub-blocks + *-----------------------------------------------------------------*/ + if (blk_size > 1) + { + for (i = 0;i < n_block; i++) + { + hypre_blas_mat_inv(diaginv+i*nb2, blk_size); + } + hypre_blas_mat_inv(diaginv+(HYPRE_Int)(blk_size*nb2),left_size); + /* + for (i = 0;i < n_block; i++) + { + for (j = 0;j < blk_size; j++) + { + + for (k = 0;k < blk_size; k ++) + { + bidx = i*nb2 + j*blk_size + k; + printf("%e\t",diaginv[bidx]); + } + printf("\n"); + } + printf("\n"); + } + */ + } + else + { + for (i = 0;i < n; i++) + { + // FIX-ME: zero-diagonal should be tested previously + if (fabs(diaginv[i]) < SMALLREAL) + diaginv[i] = 0.0; + else + diaginv[i] = 1.0 / diaginv[i]; + } + + } + + hypre_block_jacobi(A,f,u,blk_size,n_block,left_size,diaginv,Vtemp); + + /*----------------------------------------------------------------- + * Free temperary memeory + *-----------------------------------------------------------------*/ + hypre_TFree(diaginv); + + return(relax_error); +} + +/* set coarse grid solver */ +HYPRE_Int +hypre_MGRSetCoarseSolver( void *mgr_vdata, + HYPRE_Int (*coarse_grid_solver_solve)(void*,void*,void*,void*), + HYPRE_Int (*coarse_grid_solver_setup)(void*,void*,void*,void*), + void *coarse_grid_solver ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + if (!mgr_data) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + (mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve; + (mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup; + (mgr_data -> coarse_grid_solver) = (HYPRE_Solver) coarse_grid_solver; + + (mgr_data -> use_default_cgrid_solver) = 0; + + return hypre_error_flag; +} + +/* Set the maximum number of coarse levels. + * maxcoarselevs = 1 yields the default 2-grid scheme. +*/ +HYPRE_Int +hypre_MGRSetMaxCoarseLevels( void *mgr_vdata, HYPRE_Int maxcoarselevs ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> max_num_coarse_levels) = maxcoarselevs; + return hypre_error_flag; +} +/* Set the system block size */ +HYPRE_Int +hypre_MGRSetBlockSize( void *mgr_vdata, HYPRE_Int bsize ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> block_size) = bsize; + return hypre_error_flag; +} +/* Set the relaxation type for the fine levels of the reduction. + * Currently supports the following flavors of relaxation types + * as described in the documentation: + * relax_types 0 - 8, 13, 14, 18, 19, 98. + * See par_relax.c and par_relax_more.c for more details. + * +*/ +HYPRE_Int +hypre_MGRSetRelaxType( void *mgr_vdata, HYPRE_Int relax_type ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> relax_type) = relax_type; + return hypre_error_flag; +} + +/* Set the number of relaxation sweeps */ +HYPRE_Int +hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> num_relax_sweeps) = nsweeps; + return hypre_error_flag; +} + +/* Set the F-relaxation strategy: 0=single level, 1=multi level +*/ +HYPRE_Int +hypre_MGRSetFRelaxMethod( void *mgr_vdata, HYPRE_Int relax_method ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> Frelax_method) = relax_method; + return hypre_error_flag; +} +/* Set the type of the restriction type + * for computing restriction operator +*/ +HYPRE_Int +hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int restrict_type) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> restrict_type) = restrict_type; + return hypre_error_flag; +} + +/* Set the type of the interpolation + * for computing interpolation operator +*/ +HYPRE_Int +hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> interp_type) = interpType; + return hypre_error_flag; +} +/* Set the number of Jacobi interpolation iterations + * for computing interpolation operator +*/ +HYPRE_Int +hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> num_interp_sweeps) = nsweeps; + return hypre_error_flag; +} +/* Set print level for mgr solver */ +HYPRE_Int +hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> print_level) = print_level; + return hypre_error_flag; +} +/* Set print level for mgr solver */ +HYPRE_Int +hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> logging) = logging; + return hypre_error_flag; +} +/* Set max number of iterations for mgr solver */ +HYPRE_Int +hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> max_iter) = max_iter; + return hypre_error_flag; +} +/* Set convergence tolerance for mgr solver */ +HYPRE_Int +hypre_MGRSetTol( void *mgr_vdata, HYPRE_Real tol ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> tol) = tol; + return hypre_error_flag; +} +/* Set max number of iterations for mgr solver */ +HYPRE_Int +hypre_MGRSetMaxGlobalsmoothIters( void *mgr_vdata, HYPRE_Int max_iter ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> global_smooth_iters) = max_iter; + return hypre_error_flag; +} +/* Set max number of iterations for mgr solver */ + +HYPRE_Int +hypre_MGRSetGlobalsmoothType( void *mgr_vdata, HYPRE_Int iter_type ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + (mgr_data -> global_smooth_type) = iter_type; + return hypre_error_flag; +} + +/* Get number of iterations for MGR solver */ +HYPRE_Int +hypre_MGRGetNumIterations( void *mgr_vdata, HYPRE_Int *num_iterations ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + if (!mgr_data) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + *num_iterations = mgr_data->num_iterations; + + return hypre_error_flag; +} + +/* Get residual norms for MGR solver */ +HYPRE_Int +hypre_MGRGetFinalRelativeResidualNorm( void *mgr_vdata, HYPRE_Real *res_norm ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + if (!mgr_data) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + *res_norm = mgr_data->final_rel_residual_norm; + + return hypre_error_flag; +} + +HYPRE_Int +hypre_MGRBuildAff( MPI_Comm comm, HYPRE_Int local_num_variables, HYPRE_Int num_functions, + HYPRE_Int *dof_func, HYPRE_Int *CF_marker, HYPRE_Int **coarse_dof_func_ptr, HYPRE_Int **coarse_pnts_global_ptr, + hypre_ParCSRMatrix *A, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_f_ptr, hypre_ParCSRMatrix **A_ff_ptr ) +{ + HYPRE_Int *CF_marker_copy = hypre_CTAlloc(HYPRE_Int, local_num_variables); + HYPRE_Int i; + for (i = 0; i < local_num_variables; i++) { + CF_marker_copy[i] = -CF_marker[i]; + } + + hypre_BoomerAMGCoarseParms(comm, local_num_variables, 1, NULL, CF_marker_copy, coarse_dof_func_ptr, coarse_pnts_global_ptr); + hypre_MGRBuildP(A, CF_marker_copy, (*coarse_pnts_global_ptr), 0, debug_flag, P_f_ptr); + hypre_BoomerAMGBuildCoarseOperator(*P_f_ptr, A, *P_f_ptr, A_ff_ptr); + + hypre_TFree(CF_marker_copy); + return 0; +} diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr.h hypre-2.13.0/src/parcsr_ls/par_mgr.h --- hypre-2.11.2/src/parcsr_ls/par_mgr.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_mgr.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,114 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#ifndef hypre_ParMGR_DATA_HEADER +#define hypre_ParMGR_DATA_HEADER +/*-------------------------------------------------------------------------- + * hypre_ParMGRData + *--------------------------------------------------------------------------*/ +typedef struct +{ + // block data + HYPRE_Int block_size; + HYPRE_Int num_coarse_indexes; + HYPRE_Int *block_num_coarse_indexes; + HYPRE_Int **block_cf_marker; + + // initial setup data (user provided) + HYPRE_Int num_coarse_levels; + HYPRE_Int *num_coarse_per_level; + HYPRE_Int **level_coarse_indexes; + + //general data + HYPRE_Int max_num_coarse_levels; + hypre_ParCSRMatrix **A_array; + hypre_ParCSRMatrix **P_array; + hypre_ParCSRMatrix **RT_array; + hypre_ParCSRMatrix *RAP; + HYPRE_Int **CF_marker_array; + HYPRE_Int **coarse_indices_lvls; + hypre_ParVector **F_array; + hypre_ParVector **U_array; + hypre_ParVector *residual; + HYPRE_Real *rel_res_norms; + + HYPRE_Real max_row_sum; + HYPRE_Real num_interp_sweeps; + HYPRE_Int interp_type; + HYPRE_Int restrict_type; + HYPRE_Real strong_threshold; + HYPRE_Real trunc_factor; + HYPRE_Real S_commpkg_switch; + HYPRE_Int P_max_elmts; + HYPRE_Int num_iterations; + + HYPRE_Real **l1_norms; + HYPRE_Real final_rel_residual_norm; + HYPRE_Real tol; + HYPRE_Real relax_weight; + HYPRE_Int relax_type; + HYPRE_Int logging; + HYPRE_Int print_level; + HYPRE_Int max_iter; + HYPRE_Int relax_order; + HYPRE_Int num_relax_sweeps; + + HYPRE_Solver coarse_grid_solver; + HYPRE_Int (*coarse_grid_solver_setup)(void*,void*,void*,void*); + HYPRE_Int (*coarse_grid_solver_solve)(void*,void*,void*,void*); + + HYPRE_Int use_default_cgrid_solver; + HYPRE_Real omega; + + /* temp vectors for solve phase */ + hypre_ParVector *Vtemp; + hypre_ParVector *Ztemp; + hypre_ParVector *Utemp; + hypre_ParVector *Ftemp; + + HYPRE_Real *diaginv; + HYPRE_Int n_block; + HYPRE_Int left_size; + HYPRE_Int global_smooth_iters; + HYPRE_Int global_smooth_type; + HYPRE_Solver global_smoother; + /* + Number of points that remain part of the coarse grid throughout the hierarchy. + For example, number of well equations + */ + HYPRE_Int reserved_coarse_size; + HYPRE_Int *reserved_coarse_indexes; + HYPRE_Int *reserved_Cpoint_local_indexes; + + HYPRE_Int set_non_Cpoints_to_F; + + /* F-relaxation method */ + HYPRE_Int Frelax_method; + /* V-cycle F relaxation method */ + hypre_ParAMGData **FrelaxVcycleData; + + HYPRE_Int max_local_lvls; + +} hypre_ParMGRData; + + +#define FMRK -1 +#define CMRK 1 +#define UMRK 0 +#define S_CMRK 2 + +#define FPT(i, bsize) (((i) % (bsize)) == FMRK) +#define CPT(i, bsize) (((i) % (bsize)) == CMRK) + +#define SMALLREAL 1e-20 + +#endif diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr_setup.c hypre-2.13.0/src/parcsr_ls/par_mgr_setup.c --- hypre-2.11.2/src/parcsr_ls/par_mgr_setup.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_mgr_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,932 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ +#include "_hypre_parcsr_ls.h" +#include "par_mgr.h" +#include "par_amg.h" + +/* Setup MGR data */ +HYPRE_Int +hypre_MGRSetup( void *mgr_vdata, + hypre_ParCSRMatrix *A, + hypre_ParVector *f, + hypre_ParVector *u ) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + HYPRE_Int cnt,i,j, final_coarse_size, block_size, idx, row, **block_cf_marker; + HYPRE_Int lev, num_coarsening_levs, last_level, num_c_levels, num_threads,nc,index_i,cflag; + HYPRE_Int debug_flag = 0; + + hypre_ParCSRMatrix *RT = NULL; + hypre_ParCSRMatrix *P = NULL; + hypre_ParCSRMatrix *S = NULL; + hypre_ParCSRMatrix *ST = NULL; + hypre_ParCSRMatrix *AT = NULL; + + HYPRE_Int * col_offd_S_to_A = NULL; + HYPRE_Int * col_offd_ST_to_AT = NULL; + HYPRE_Int * dof_func_buff = NULL; + HYPRE_Int * coarse_pnts_global = NULL; + HYPRE_Real **l1_norms = NULL; + + hypre_ParVector *Ztemp; + hypre_ParVector *Vtemp; + hypre_ParVector *Utemp; + hypre_ParVector *Ftemp; + + /* pointers to mgr data */ + HYPRE_Int use_default_cgrid_solver = (mgr_data -> use_default_cgrid_solver); + HYPRE_Int logging = (mgr_data -> logging); + HYPRE_Int print_level = (mgr_data -> print_level); + HYPRE_Int relax_type = (mgr_data -> relax_type); + HYPRE_Int relax_order = (mgr_data -> relax_order); + HYPRE_Int interp_type = (mgr_data -> interp_type); + HYPRE_Int restrict_type = (mgr_data -> restrict_type); + HYPRE_Int num_interp_sweeps = (mgr_data -> num_interp_sweeps); + HYPRE_Int num_restrict_sweeps = (mgr_data -> num_interp_sweeps); + HYPRE_Int max_elmts = (mgr_data -> P_max_elmts); + HYPRE_Real max_row_sum = (mgr_data -> max_row_sum); + HYPRE_Real strong_threshold = (mgr_data -> strong_threshold); + HYPRE_Real trunc_factor = (mgr_data -> trunc_factor); + HYPRE_Real S_commpkg_switch = (mgr_data -> S_commpkg_switch); + HYPRE_Int old_num_coarse_levels = (mgr_data -> num_coarse_levels); + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int * reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes); + HYPRE_Int ** CF_marker_array = (mgr_data -> CF_marker_array); + hypre_ParCSRMatrix **A_array = (mgr_data -> A_array); + hypre_ParCSRMatrix **P_array = (mgr_data -> P_array); + hypre_ParCSRMatrix **RT_array = (mgr_data -> RT_array); + hypre_ParCSRMatrix *RAP_ptr = NULL; + + hypre_ParVector **F_array = (mgr_data -> F_array); + hypre_ParVector **U_array = (mgr_data -> U_array); + hypre_ParVector *residual = (mgr_data -> residual); + HYPRE_Real *rel_res_norms = (mgr_data -> rel_res_norms); + + HYPRE_Solver default_cg_solver; + HYPRE_Int (*coarse_grid_solver_setup)(void*,void*,void*,void*) = (HYPRE_Int (*)(void*, void*, void*, void*)) (mgr_data -> coarse_grid_solver_setup); + HYPRE_Int (*coarse_grid_solver_solve)(void*,void*,void*,void*) = (HYPRE_Int (*)(void*, void*, void*, void*)) (mgr_data -> coarse_grid_solver_solve); + + HYPRE_Int global_smooth_type = (mgr_data -> global_smooth_type); + + HYPRE_Int reserved_coarse_size = (mgr_data -> reserved_coarse_size); + + HYPRE_Int num_procs, my_id; + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int blk_size = (mgr_data -> block_size); + + hypre_ParAMGData **FrelaxVcycleData = (mgr_data -> FrelaxVcycleData); + HYPRE_Int Frelax_method = (mgr_data -> Frelax_method); + + /* ----- begin -----*/ + + num_threads = hypre_NumThreads(); + + block_size = (mgr_data -> block_size); + block_cf_marker = (mgr_data -> block_cf_marker); + + HYPRE_Int **level_coarse_indexes = NULL; + HYPRE_Int *level_coarse_size = NULL; + HYPRE_Int setNonCpointToF = (mgr_data -> set_non_Cpoints_to_F); + HYPRE_Int *reserved_coarse_indexes = (mgr_data -> reserved_coarse_indexes); + + if (print_level > 0) + { + hypre_printf("Solver info: \n"); + hypre_printf("Relax type: %d\n", relax_type); + hypre_printf("Number of relax sweeps: %d\n", (mgr_data -> num_relax_sweeps)); + hypre_printf("Interpolation type: %d\n", interp_type); + hypre_printf("Number of interpolation sweeps: %d\n", num_interp_sweeps); + hypre_printf("Restriction type: %d\n", restrict_type); + hypre_printf("Max number of iterations: %d\n", (mgr_data -> max_iter)); + hypre_printf("Max number of coarse levels: %d\n", (mgr_data -> max_num_coarse_levels)); + hypre_printf("Tolerance: %e\n", (mgr_data -> tol)); + } + + +// HYPRE_Int num_coarse_levels = (mgr_data -> max_num_coarse_levels); + + HYPRE_Int nloc = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A)); + HYPRE_Int ilower = hypre_ParCSRMatrixFirstRowIndex(A); + HYPRE_Int iupper = hypre_ParCSRMatrixLastRowIndex(A); + + /* Trivial case: simply solve the coarse level problem */ + if( block_size < 2 || (mgr_data -> max_num_coarse_levels) < 1) + { + hypre_printf("Warning: Block size is < 2 or number of coarse levels is < 1. \n"); + hypre_printf("Solving scalar problem on fine grid using coarse level solver \n"); + + if(use_default_cgrid_solver) + { + hypre_printf("No coarse grid solver provided. Using default AMG solver ... \n"); + /* create and set default solver parameters here */ + /* create and initialize default_cg_solver */ + default_cg_solver = (HYPRE_Solver) hypre_BoomerAMGCreate(); + hypre_BoomerAMGSetMaxIter ( default_cg_solver, (mgr_data -> max_iter) ); + + hypre_BoomerAMGSetRelaxOrder( default_cg_solver, 1); + hypre_BoomerAMGSetPrintLevel(default_cg_solver, 3); + /* set setup and solve functions */ + coarse_grid_solver_setup = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSetup; + coarse_grid_solver_solve = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSolve; + (mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup; + (mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve; + (mgr_data -> coarse_grid_solver) = default_cg_solver; + } + + // keep reserved coarse indexes to coarsest grid + if((mgr_data -> reserved_Cpoint_local_indexes) != NULL) + hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes)); + if (reserved_coarse_size > 0) + { + (mgr_data -> reserved_Cpoint_local_indexes) = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size); + reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes); + cnt=0; + for(i=0; icoarse_grid_solver), 25,reserved_coarse_size,reserved_Cpoint_local_indexes); + } + + /* setup coarse grid solver */ +// hypre_BoomerAMGSetMaxIter ( (mgr_data -> coarse_grid_solver), (mgr_data -> max_iter) ); +// hypre_BoomerAMGSetPrintLevel((mgr_data -> coarse_grid_solver), 3); + coarse_grid_solver_setup((mgr_data -> coarse_grid_solver), A, f, u); + (mgr_data -> num_coarse_levels) = 0; + + return hypre_error_flag; + } + +/* + if ((mgr_data -> level_coarse_indexes) != NULL) + { + for(i=0; i level_coarse_indexes)[i] != NULL) + { + hypre_TFree((mgr_data -> level_coarse_indexes)[i]); + } + } + hypre_TFree((mgr_data -> level_coarse_indexes)); + (mgr_data -> level_coarse_indexes) = NULL; + hypre_TFree((mgr_data -> num_coarse_per_level)); + (mgr_data -> num_coarse_per_level) = NULL; + } +*/ + + /* Initialize local indexes of coarse sets at different levels */ + level_coarse_indexes = hypre_CTAlloc(HYPRE_Int*, max_num_coarse_levels); + for (i = 0; i < max_num_coarse_levels; i++) + { + level_coarse_indexes[i] = hypre_CTAlloc(HYPRE_Int,nloc); + } + + level_coarse_size = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels); + + // loop over levels + for(i=0; i reserved_Cpoint_local_indexes) != NULL) + hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes)); + if (reserved_coarse_size > 0) + { + (mgr_data -> reserved_Cpoint_local_indexes) = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size); + reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes); +// cnt=0; + for(i=0; i level_coarse_indexes) = level_coarse_indexes; + + (mgr_data -> num_coarse_per_level) = level_coarse_size; + + /* Free Previously allocated data, if any not destroyed */ + if (A_array || P_array || RT_array || CF_marker_array) + { + for (j = 1; j < (old_num_coarse_levels); j++) + { + if (A_array[j]) + { + hypre_ParCSRMatrixDestroy(A_array[j]); + A_array[j] = NULL; + } + } + + for (j = 0; j < old_num_coarse_levels; j++) + { + if (P_array[j]) + { + hypre_ParCSRMatrixDestroy(P_array[j]); + P_array[j] = NULL; + } + + if (RT_array[j]) + { + hypre_ParCSRMatrixDestroy(RT_array[j]); + RT_array[j] = NULL; + } + + if (CF_marker_array[j]) + { + hypre_TFree(CF_marker_array[j]); + CF_marker_array[j] = NULL; + } + } + hypre_TFree(P_array); + P_array = NULL; + hypre_TFree(RT_array); + RT_array = NULL; + hypre_TFree(CF_marker_array); + CF_marker_array = NULL; + } + + /* Free previously allocated FrelaxVcycleData if not destroyed + */ + if(FrelaxVcycleData) + { + for (j = 0; j < old_num_coarse_levels; j++) + { + if (FrelaxVcycleData[j]) + { + hypre_MGRDestroyFrelaxVcycleData(FrelaxVcycleData[j]); + FrelaxVcycleData[j] = NULL; + } + } + hypre_TFree(FrelaxVcycleData); + FrelaxVcycleData = NULL; + } + // reset pointer to NULL + (mgr_data -> FrelaxVcycleData) = FrelaxVcycleData; + + /* destroy final coarse grid matrix, if not previously destroyed */ + if((mgr_data -> RAP)) + { + hypre_ParCSRMatrixDestroy((mgr_data -> RAP)); + (mgr_data -> RAP) = NULL; + } + + /* Setup for global block smoothers*/ + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + if (my_id == num_procs) + { + mgr_data -> n_block = (n - reserved_coarse_size) / blk_size; + mgr_data -> left_size = n - blk_size*(mgr_data -> n_block); + } + else + { + mgr_data -> n_block = n / blk_size; + mgr_data -> left_size = n - blk_size*(mgr_data -> n_block); + } + if (global_smooth_type == 0) + { + hypre_blockRelax_setup(A,blk_size,reserved_coarse_size,&(mgr_data -> diaginv)); + } + else if (global_smooth_type == 8) + { + HYPRE_EuclidCreate(comm, &(mgr_data -> global_smoother)); + HYPRE_EuclidSetLevel(mgr_data -> global_smoother, 0); + HYPRE_EuclidSetBJ(mgr_data -> global_smoother, 1); + HYPRE_EuclidSetup(mgr_data -> global_smoother, A, f, u); + } + + + /* clear old l1_norm data, if created */ + if((mgr_data -> l1_norms)) + { + for (j = 0; j < (old_num_coarse_levels); j++) + { + if ((mgr_data -> l1_norms)[j]) + { + hypre_TFree((mgr_data -> l1_norms)[j]); + (mgr_data -> l1_norms)[j] = NULL; + } + } + hypre_TFree((mgr_data -> l1_norms)); + } + + /* setup temporary storage */ + if ((mgr_data -> Ztemp)) + { + hypre_ParVectorDestroy((mgr_data -> Ztemp)); + (mgr_data -> Ztemp) = NULL; + } + if ((mgr_data -> Vtemp)) + { + hypre_ParVectorDestroy((mgr_data -> Vtemp)); + (mgr_data -> Vtemp) = NULL; + } + if ((mgr_data -> Utemp)) + { + hypre_ParVectorDestroy((mgr_data -> Utemp)); + (mgr_data -> Utemp) = NULL; + } + if ((mgr_data -> Ftemp)) + { + hypre_ParVectorDestroy((mgr_data -> Ftemp)); + (mgr_data -> Ftemp) = NULL; + } + if ((mgr_data -> residual)) + { + hypre_ParVectorDestroy((mgr_data -> residual)); + (mgr_data -> residual) = NULL; + } + if ((mgr_data -> rel_res_norms)) + { + hypre_TFree((mgr_data -> rel_res_norms)); + (mgr_data -> rel_res_norms) = NULL; + } + + Vtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A), + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixRowStarts(A)); + hypre_ParVectorInitialize(Vtemp); + hypre_ParVectorSetPartitioningOwner(Vtemp,0); + (mgr_data ->Vtemp) = Vtemp; + + Ztemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A), + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixRowStarts(A)); + hypre_ParVectorInitialize(Ztemp); + hypre_ParVectorSetPartitioningOwner(Ztemp,0); + (mgr_data -> Ztemp) = Ztemp; + + Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A), + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixRowStarts(A)); + hypre_ParVectorInitialize(Utemp); + hypre_ParVectorSetPartitioningOwner(Utemp,0); + (mgr_data ->Utemp) = Utemp; + + Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A), + hypre_ParCSRMatrixGlobalNumRows(A), + hypre_ParCSRMatrixRowStarts(A)); + hypre_ParVectorInitialize(Ftemp); + hypre_ParVectorSetPartitioningOwner(Ftemp,0); + (mgr_data ->Ftemp) = Ftemp; + + /* Allocate memory for level structure */ + if (A_array == NULL) + A_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels); + if (P_array == NULL && max_num_coarse_levels > 0) + P_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels); + if (RT_array == NULL && max_num_coarse_levels > 0) + RT_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels); + if (CF_marker_array == NULL) + CF_marker_array = hypre_CTAlloc(HYPRE_Int*, max_num_coarse_levels); + + /* set pointers to mgr data */ + (mgr_data -> A_array) = A_array; + (mgr_data -> P_array) = P_array; + (mgr_data -> RT_array) = RT_array; + (mgr_data -> CF_marker_array) = CF_marker_array; + + /* Set up solution and rhs arrays */ + if (F_array != NULL || U_array != NULL) + { + for (j = 1; j < old_num_coarse_levels+1; j++) + { + if (F_array[j] != NULL) + { + hypre_ParVectorDestroy(F_array[j]); + F_array[j] = NULL; + } + if (U_array[j] != NULL) + { + hypre_ParVectorDestroy(U_array[j]); + U_array[j] = NULL; + } + } + } + + if (F_array == NULL) + F_array = hypre_CTAlloc(hypre_ParVector*, max_num_coarse_levels+1); + if (U_array == NULL) + U_array = hypre_CTAlloc(hypre_ParVector*, max_num_coarse_levels+1); + + /* set solution and rhs pointers */ + F_array[0] = f; + U_array[0] = u; + + (mgr_data -> F_array) = F_array; + (mgr_data -> U_array) = U_array; + + /* begin coarsening loop */ + num_coarsening_levs = max_num_coarse_levels; + /* initialize level data matrix here */ + RAP_ptr = A; + /* loop over levels of coarsening */ + for(lev = 0; lev < num_coarsening_levs; lev++) + { + /* check if this is the last level */ + last_level = ((lev == num_coarsening_levs-1)); + + /* initialize A_array */ + A_array[lev] = RAP_ptr; + nloc = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[lev])); + + /* Compute strength matrix for interpolation operator - use default parameters, to be modified later */ + hypre_BoomerAMGCreateS(A_array[lev], strong_threshold, max_row_sum, 1, NULL, &S); + + /* use appropriate communication package for Strength matrix */ + if (strong_threshold > S_commpkg_switch) + hypre_BoomerAMGCreateSCommPkg(A_array[lev],S,&col_offd_S_to_A); + + /* Coarsen: Build CF_marker array based on rows of A */ + cflag = ((last_level || setNonCpointToF)); + hypre_MGRCoarsen(S, A_array[lev], level_coarse_size[lev], level_coarse_indexes[lev],debug_flag, &CF_marker_array[lev], cflag); + + /* Get global coarse sizes. Note that we assume num_functions = 1 + * so dof_func arrays are NULL */ + hypre_BoomerAMGCoarseParms(comm, nloc, 1, NULL, CF_marker_array[lev], &dof_func_buff,&coarse_pnts_global); + /* Compute Petrov-Galerkin operators */ + /* Interpolation operator */ + num_interp_sweeps = (mgr_data -> num_interp_sweeps); + + hypre_MGRBuildInterp(A_array[lev], CF_marker_array[lev], S, coarse_pnts_global, 1, dof_func_buff, + debug_flag, trunc_factor, max_elmts, col_offd_S_to_A, &P, 1, interp_type, num_interp_sweeps); + + P_array[lev] = P; + + /* Build AT (transpose A) */ + hypre_ParCSRMatrixTranspose(A_array[lev], &AT, 1); + + /* Build new strength matrix */ + hypre_BoomerAMGCreateS(AT, strong_threshold, max_row_sum, 1, NULL, &ST); + /* use appropriate communication package for Strength matrix */ + if (strong_threshold > S_commpkg_switch) + hypre_BoomerAMGCreateSCommPkg(AT, ST, &col_offd_ST_to_AT); + + num_restrict_sweeps = 0; /* do injection for restriction */ + hypre_MGRBuildInterp(AT, CF_marker_array[lev], ST, coarse_pnts_global, 1, dof_func_buff, + debug_flag, trunc_factor, max_elmts, col_offd_ST_to_AT, &RT, last_level, 0, num_restrict_sweeps); + + RT_array[lev] = RT; + + /* Compute RAP for next level */ + hypre_BoomerAMGBuildCoarseOperator(RT, A_array[lev], P, &RAP_ptr); + + /* Update coarse level indexes for next levels */ + if (lev < num_coarsening_levs - 1) + { + // first mark indexes to be updated + for(i=0; inum_coarse_levels) = num_c_levels; + (mgr_data->RAP) = RAP_ptr; + + /* setup default coarse grid solver */ + /* default is BoomerAMG */ + if(use_default_cgrid_solver) + { + hypre_printf("No coarse grid solver provided. Using default AMG solver ... \n"); + /* create and set default solver parameters here */ + default_cg_solver = (HYPRE_Solver) hypre_BoomerAMGCreate(); + hypre_BoomerAMGSetMaxIter ( default_cg_solver, 1 ); + hypre_BoomerAMGSetRelaxOrder( default_cg_solver, 1); + hypre_BoomerAMGSetPrintLevel(default_cg_solver, 0); + /* set setup and solve functions */ + coarse_grid_solver_setup = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSetup; + coarse_grid_solver_solve = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSolve; + (mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup; + (mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve; + (mgr_data -> coarse_grid_solver) = default_cg_solver; + } + // keep reserved coarse indexes to coarsest grid + if(reserved_coarse_size > 0) + HYPRE_BoomerAMGSetCpointsToKeep((mgr_data ->coarse_grid_solver), 25,reserved_coarse_size,reserved_Cpoint_local_indexes); + + /* setup coarse grid solver */ + coarse_grid_solver_setup((mgr_data -> coarse_grid_solver), RAP_ptr, F_array[num_c_levels], U_array[num_c_levels]); + + /* Setup smoother for fine grid */ + if ( relax_type == 8 || relax_type == 13 || relax_type == 14 || relax_type == 18 ) + { + l1_norms = hypre_CTAlloc(HYPRE_Real *, num_c_levels); + (mgr_data -> l1_norms) = l1_norms; + } + for (j = 0; j < num_c_levels; j++) + { + if (num_threads == 1) + { + if (relax_type == 8 || relax_type == 13 || relax_type == 14) + { + if (relax_order) + hypre_ParCSRComputeL1Norms(A_array[j], 4, CF_marker_array[j], &l1_norms[j]); + else + hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]); + } + else if (relax_type == 18) + { + if (relax_order) + hypre_ParCSRComputeL1Norms(A_array[j], 1, CF_marker_array[j], &l1_norms[j]); + else + hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]); + } + } + else + { + if (relax_type == 8 || relax_type == 13 || relax_type == 14) + { + if (relax_order) + hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, CF_marker_array[j] , &l1_norms[j]); + else + hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]); + } + else if (relax_type == 18) + { + if (relax_order) + hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, CF_marker_array[j] , &l1_norms[j]); + else + hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]); + } + } + } + + /* Setup Vcycle data for Frelax_method > 0 */ + if(Frelax_method == 1) + { + /* allocate memory and set pointer to (mgr_data -> FrelaxVcycleData) */ + if(FrelaxVcycleData == NULL) + FrelaxVcycleData = hypre_CTAlloc(hypre_ParAMGData*, max_num_coarse_levels); + (mgr_data -> FrelaxVcycleData) = FrelaxVcycleData; + /* loop over levels */ + for(i=0; i<(mgr_data->num_coarse_levels); i++) + { + FrelaxVcycleData[i] = (hypre_ParAMGData*) hypre_MGRCreateFrelaxVcycleData(); + (FrelaxVcycleData[i] -> Vtemp) = Vtemp; + (FrelaxVcycleData[i] -> Ztemp) = Ztemp; + + // setup variables for the V-cycle in the F-relaxation step // + hypre_MGRSetupFrelaxVcycleData(mgr_data, A_array[i], F_array[i], U_array[i], i); + } + } + + if ( logging > 1 ) { + + residual = + hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_array[0]), + hypre_ParCSRMatrixGlobalNumRows(A_array[0]), + hypre_ParCSRMatrixRowStarts(A_array[0]) ); + hypre_ParVectorInitialize(residual); + hypre_ParVectorSetPartitioningOwner(residual,0); + (mgr_data -> residual) = residual; + } + else{ + (mgr_data -> residual) = NULL; + } + rel_res_norms = hypre_CTAlloc(HYPRE_Real,(mgr_data -> max_iter)); + (mgr_data -> rel_res_norms) = rel_res_norms; + + /* free level_coarse_indexes data */ + if ( level_coarse_indexes != NULL) + { + for(i=0; i FrelaxVcycleData; + + HYPRE_Int i, j, num_procs, my_id; + + HYPRE_Int max_local_lvls = (mgr_data -> max_local_lvls); + HYPRE_Int lev_local; + HYPRE_Int not_finished; +// HYPRE_Int min_local_coarse_size = 0; + HYPRE_Int max_local_coarse_size = 2; + HYPRE_Int ge_relax_type = 9; + HYPRE_Int **CF_marker_array = (mgr_data -> CF_marker_array); + HYPRE_Int local_size; + HYPRE_Int local_coarse_size; + + HYPRE_Int *coarse_pnts_global_lvl = NULL; + HYPRE_Int *coarse_dof_func_lvl = NULL; + + hypre_ParCSRMatrix *RAP_local = NULL; + hypre_ParCSRMatrix *P_local = NULL; + hypre_ParCSRMatrix *S_local = NULL; + + HYPRE_Int smrk_local = -1; + + HYPRE_Int old_num_levels = (FrelaxVcycleData[lev] -> num_levels); + HYPRE_Int **CF_marker_array_local = (FrelaxVcycleData[lev] -> CF_marker_array); + HYPRE_Int *CF_marker_local = NULL; + hypre_ParCSRMatrix **A_array_local = (FrelaxVcycleData[lev] -> A_array); + hypre_ParCSRMatrix **P_array_local = (FrelaxVcycleData[lev] -> P_array); + hypre_ParVector **F_array_local = (FrelaxVcycleData[lev] -> F_array); + hypre_ParVector **U_array_local = (FrelaxVcycleData[lev] -> U_array); + + hypre_MPI_Comm_size(comm, &num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + + local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A)); + + /* Free any local data not previously destroyed */ + if (A_array_local || P_array_local || CF_marker_array_local) + { + for (j = 1; j < old_num_levels; j++) + { + if (A_array_local[j]) + { + hypre_ParCSRMatrixDestroy(A_array_local[j]); + A_array_local[j] = NULL; + } + } + + for (j = 0; j < old_num_levels-1; j++) + { + if (P_array_local[j]) + { + hypre_ParCSRMatrixDestroy(P_array_local[j]); + P_array_local[j] = NULL; + } + } + + for (j = 0; j < old_num_levels-1; j++) + { + if (CF_marker_array_local[j]) + { + hypre_TFree(CF_marker_array_local[j]); + CF_marker_array_local[j] = NULL; + } + } + hypre_TFree(A_array_local); + A_array_local = NULL; + hypre_TFree(P_array_local); + P_array_local = NULL; + hypre_TFree(CF_marker_array_local); + CF_marker_array_local = NULL; + } + /* free solution arrays not previously destroyed */ + if (F_array_local != NULL || U_array_local != NULL) + { + for (j = 1; j < old_num_levels; j++) + { + if (F_array_local[j] != NULL) + { + hypre_ParVectorDestroy(F_array_local[j]); + F_array_local[j] = NULL; + } + if (U_array_local[j] != NULL) + { + hypre_ParVectorDestroy(U_array_local[j]); + U_array_local[j] = NULL; + } + } + hypre_TFree(F_array_local); + F_array_local = NULL; + hypre_TFree(U_array_local); + U_array_local = NULL; + } + + /* Initialize some variables and allocate memory */ + not_finished = 1; + lev_local = 0; + if(A_array_local == NULL) + A_array_local = hypre_CTAlloc(hypre_ParCSRMatrix*, max_local_lvls); + if(P_array_local == NULL && max_local_lvls > 1) + P_array_local = hypre_CTAlloc(hypre_ParCSRMatrix*, max_local_lvls-1); + if(F_array_local == NULL) + F_array_local = hypre_CTAlloc(hypre_ParVector*, max_local_lvls); + if(U_array_local == NULL) + U_array_local = hypre_CTAlloc(hypre_ParVector*, max_local_lvls); + if(CF_marker_array_local == NULL) + CF_marker_array_local = hypre_CTAlloc(HYPRE_Int*, max_local_lvls); + + A_array_local[0] = A; + F_array_local[0] = f; + U_array_local[0] = u; + + /* Special case max_local_lvls == 1 */ + if (max_local_lvls == 1) + { + CF_marker_local = hypre_CTAlloc(HYPRE_Int, local_size ); + for (i=0; i < local_size ; i++) + CF_marker_local[i] = 1; + CF_marker_array_local[0] = CF_marker_local; + lev_local = max_local_lvls; + not_finished = 0; + } + + while (not_finished) + { + local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array_local[lev_local])); + + if (lev_local == 0) { + /* use the CF_marker from the outer MGR cycle to create the strength connection matrix */ + hypre_BoomerAMGCreateSFromCFMarker(A_array_local[lev_local], 0.25, 0.9, CF_marker_array[lev], smrk_local, &S_local); + } else if (lev_local > 0) { + hypre_BoomerAMGCreateS(A_array_local[lev_local], 0.25, 0.9, 1, NULL, &S_local); + } + +// hypre_BoomerAMGCoarsenFalgout(S_local, A_array_local[lev_local], 0, 0, &CF_marker_local); + hypre_BoomerAMGCoarsen(S_local, A_array_local[lev_local], 0, 0, &CF_marker_local); + /* For the lev_local=0, the coarsening routine is called on the fine-grid (the whole matrix) + * thus, some C-points of the outer MGR level may have been set to F-points in the coarsening + * routine. We need to reset these back to C-points (before building the interpolation operator. + */ + if (lev_local == 0) { + for (i = 0; i < local_size; i++) { + if (CF_marker_array[lev][i] == 1) { + CF_marker_local[i] = 1; + } + } + } + + hypre_BoomerAMGCoarseParms(comm, local_size, + 1, NULL, CF_marker_local, + &coarse_dof_func_lvl, &coarse_pnts_global_lvl); + hypre_BoomerAMGBuildInterp(A_array_local[lev_local], CF_marker_local, + S_local, coarse_pnts_global_lvl, 1, NULL, + 0, 0.0, 0, NULL, &P_local); + + /* save the CF_marker and interpolation matrix pointers */ + CF_marker_array_local[lev_local] = CF_marker_local; + P_array_local[lev_local] = P_local; + + /* build the coarse grid */ + hypre_BoomerAMGBuildCoarseOperatorKT(P_local, A_array_local[lev_local], + P_local, 0, &RAP_local); +// hypre_printf("Coarse size lev %d = %d\n", lev_local+1, hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(RAP_local))); + +#ifdef HYPRE_NO_GLOBAL_PARTITION + if (my_id == (num_procs -1)) local_coarse_size = coarse_pnts_global_lvl[1]; + hypre_MPI_Bcast(&local_coarse_size, 1, HYPRE_MPI_INT, num_procs-1, comm); +#else + local_coarse_size = coarse_pnts_global_lvl[num_procs]; +#endif + + lev_local++; + + if (S_local) hypre_ParCSRMatrixDestroy(S_local); + S_local = NULL; + if ( (lev_local == max_local_lvls-1) || (local_coarse_size <= max_local_coarse_size) ) + { + not_finished = 0; + } + + A_array_local[lev_local] = RAP_local; + F_array_local[lev_local] = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_local), + hypre_ParCSRMatrixGlobalNumRows(RAP_local), + hypre_ParCSRMatrixRowStarts(RAP_local)); + hypre_ParVectorInitialize(F_array_local[lev_local]); + hypre_ParVectorSetPartitioningOwner(F_array_local[lev_local], 0); + + U_array_local[lev_local] = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_local), + hypre_ParCSRMatrixGlobalNumRows(RAP_local), + hypre_ParCSRMatrixRowStarts(RAP_local)); + hypre_ParVectorInitialize(U_array_local[lev_local]); + hypre_ParVectorSetPartitioningOwner(U_array_local[lev_local], 0); + + } // end while loop + + // setup Vcycle data + (FrelaxVcycleData[lev] -> A_array) = A_array_local; + (FrelaxVcycleData[lev] -> P_array) = P_array_local; + (FrelaxVcycleData[lev] -> F_array) = F_array_local; + (FrelaxVcycleData[lev] -> U_array) = U_array_local; + (FrelaxVcycleData[lev] -> CF_marker_array) = CF_marker_array_local; + (FrelaxVcycleData[lev] -> num_levels) = lev_local+1; + + if(lev_local > 1) + hypre_GaussElimSetup(FrelaxVcycleData[lev], lev_local, ge_relax_type); + + return hypre_error_flag; +} diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr_solve.c hypre-2.13.0/src/parcsr_ls/par_mgr_solve.c --- hypre-2.11.2/src/parcsr_ls/par_mgr_solve.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_mgr_solve.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,581 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + + + + + +/****************************************************************************** + * + * MGR solve routine + * + *****************************************************************************/ +#include "_hypre_parcsr_ls.h" +#include "par_mgr.h" +#include "par_amg.h" + +/*-------------------------------------------------------------------- + * hypre_MGRSolve + *--------------------------------------------------------------------*/ +HYPRE_Int +hypre_MGRSolve( void *mgr_vdata, + hypre_ParCSRMatrix *A, + hypre_ParVector *f, + hypre_ParVector *u ) +{ + + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + hypre_ParCSRMatrix **A_array = (mgr_data -> A_array); + hypre_ParVector **F_array = (mgr_data -> F_array); + hypre_ParVector **U_array = (mgr_data -> U_array); + + HYPRE_Real tol = (mgr_data -> tol); + HYPRE_Int logging = (mgr_data -> logging); + HYPRE_Int print_level = (mgr_data -> print_level); + HYPRE_Int max_iter = (mgr_data -> max_iter); + HYPRE_Real *norms = (mgr_data -> rel_res_norms); + hypre_ParVector *Vtemp = (mgr_data -> Vtemp); + hypre_ParVector *residual; + + HYPRE_Real alpha = -1; + HYPRE_Real beta = 1; + HYPRE_Real conv_factor = 0.0; + HYPRE_Real resnorm = 1.0; + HYPRE_Real init_resnorm = 0.0; + HYPRE_Real rel_resnorm; + HYPRE_Real rhs_norm = 0.0; + HYPRE_Real old_resnorm; + HYPRE_Real ieee_check = 0.; + + HYPRE_Int iter, num_procs, my_id; + HYPRE_Int Solve_err_flag; + +/* + HYPRE_Real total_coeffs; + HYPRE_Real total_variables; + HYPRE_Real operat_cmplxty; + HYPRE_Real grid_cmplxty; +*/ + HYPRE_Solver cg_solver = (mgr_data -> coarse_grid_solver); + HYPRE_Int (*coarse_grid_solver_solve)(void*,void*,void*,void*) = (mgr_data -> coarse_grid_solver_solve); + + HYPRE_Int blk_size = (mgr_data -> block_size); + HYPRE_Real *diaginv = (mgr_data -> diaginv); + HYPRE_Int n_block = (mgr_data -> n_block); + HYPRE_Int left_size = (mgr_data -> left_size); + + HYPRE_Int global_smooth_iters = (mgr_data -> global_smooth_iters); + HYPRE_Int global_smooth_type = (mgr_data -> global_smooth_type); + + HYPRE_Int i; + + if(logging > 1) + { + residual = (mgr_data -> residual); + } + + (mgr_data -> num_iterations) = 0; + + if((mgr_data -> num_coarse_levels) == 0) + { + /* Do scalar AMG solve when only one level */ + coarse_grid_solver_solve(cg_solver, A, f, u); + HYPRE_BoomerAMGGetNumIterations(cg_solver, &iter); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(cg_solver, &rel_resnorm); + (mgr_data -> num_iterations) = iter; + (mgr_data -> final_rel_residual_norm) = rel_resnorm; + return hypre_error_flag; + } + + U_array[0] = u; + F_array[0] = f; + + hypre_MPI_Comm_size(comm, &num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + + /*----------------------------------------------------------------------- + * Write the solver parameters + *-----------------------------------------------------------------------*/ +// if (my_id == 0 && print_level > 1) +// hypre_MGRWriteSolverParams(mgr_data); + + /*----------------------------------------------------------------------- + * Initialize the solver error flag and assorted bookkeeping variables + *-----------------------------------------------------------------------*/ + + Solve_err_flag = 0; +/* + total_coeffs = 0; + total_variables = 0; + operat_cmplxty = 0; + grid_cmplxty = 0; +*/ + /*----------------------------------------------------------------------- + * write some initial info + *-----------------------------------------------------------------------*/ + + if (my_id == 0 && print_level > 1 && tol > 0.) + hypre_printf("\n\nTWO-GRID SOLVER SOLUTION INFO:\n"); + + + /*----------------------------------------------------------------------- + * Compute initial fine-grid residual and print + *-----------------------------------------------------------------------*/ + if (print_level > 1 || logging > 1 || tol > 0.) + { + if ( logging > 1 ) { + hypre_ParVectorCopy(F_array[0], residual ); + if (tol > 0) + hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, residual ); + resnorm = sqrt(hypre_ParVectorInnerProd( residual, residual )); + } + else { + hypre_ParVectorCopy(F_array[0], Vtemp); + if (tol > 0) + hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, Vtemp); + resnorm = sqrt(hypre_ParVectorInnerProd(Vtemp, Vtemp)); + } + + /* Since it is does not diminish performance, attempt to return an error flag + and notify users when they supply bad input. */ + if (resnorm != 0.) ieee_check = resnorm/resnorm; /* INF -> NaN conversion */ + if (ieee_check != ieee_check) + { + /* ...INFs or NaNs in input can make ieee_check a NaN. This test + for ieee_check self-equality works on all IEEE-compliant compilers/ + machines, c.f. page 8 of "Lecture Notes on the Status of IEEE 754" + by W. Kahan, May 31, 1996. Currently (July 2002) this paper may be + found at http://HTTP.CS.Berkeley.EDU/~wkahan/ieee754status/IEEE754.PDF */ + if (print_level > 0) + { + hypre_printf("\n\nERROR detected by Hypre ... BEGIN\n"); + hypre_printf("ERROR -- hypre_MGRSolve: INFs and/or NaNs detected in input.\n"); + hypre_printf("User probably placed non-numerics in supplied A, x_0, or b.\n"); + hypre_printf("ERROR detected by Hypre ... END\n\n\n"); + } + hypre_error(HYPRE_ERROR_GENERIC); + return hypre_error_flag; + } + + init_resnorm = resnorm; + rhs_norm = sqrt(hypre_ParVectorInnerProd(f, f)); + if (rhs_norm) + { + rel_resnorm = init_resnorm / rhs_norm; + } + else + { + /* rhs is zero, return a zero solution */ + hypre_ParVectorSetConstantValues(U_array[0], 0.0); + if(logging > 0) + { + rel_resnorm = 0.0; + (mgr_data -> final_rel_residual_norm) = rel_resnorm; + } + return hypre_error_flag; + } + } + else + { + rel_resnorm = 1.; + } + + if (my_id == 0 && print_level > 1) + { + hypre_printf(" relative\n"); + hypre_printf(" residual factor residual\n"); + hypre_printf(" -------- ------ --------\n"); + hypre_printf(" Initial %e %e\n",init_resnorm, + rel_resnorm); + } + /************** Main Solver Loop - always do 1 iteration ************/ + iter = 0; + while ((rel_resnorm >= tol || iter < 1) + && iter < max_iter) + { + if (global_smooth_iters) + { + if (global_smooth_type == 0)//block Jacobi smoother + { + for (i = 0;i < global_smooth_iters;i ++) + hypre_block_jacobi(A_array[0],F_array[0],U_array[0],blk_size,n_block,left_size,diaginv,Vtemp); + } + else if ((global_smooth_type > 0) && (global_smooth_type < 7)) + { + for (i = 0;i < global_smooth_iters;i ++) + hypre_BoomerAMGRelax(A_array[0], F_array[0], NULL, global_smooth_type-1, 0, 1.0, 0.0, NULL, U_array[0], Vtemp, NULL); + } + else if (global_smooth_type == 8)//ILU smoother + { + for (i = 0;i < global_smooth_iters;i ++) + HYPRE_EuclidSolve( (mgr_data -> global_smoother),A_array[0],F_array[0],U_array[0]); + } + } + + /* Do one cycle of reduction solve on Ae=r */ + hypre_MGRCycle(mgr_data, F_array, U_array); + + /*--------------------------------------------------------------- + * Compute fine-grid residual and residual norm + *----------------------------------------------------------------*/ + + if (print_level > 1 || logging > 1 || tol > 0.) + { + old_resnorm = resnorm; + + if ( logging > 1 ) { + hypre_ParVectorCopy(F_array[0], residual); + hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, residual ); + resnorm = sqrt(hypre_ParVectorInnerProd( residual, residual )); + } + else { + hypre_ParVectorCopy(F_array[0], Vtemp); + hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, Vtemp); + resnorm = sqrt(hypre_ParVectorInnerProd(Vtemp, Vtemp)); + } + + if (old_resnorm) conv_factor = resnorm / old_resnorm; + else conv_factor = resnorm; + if (rhs_norm) + { + rel_resnorm = resnorm / rhs_norm; + } + else + { + rel_resnorm = resnorm; + } + + norms[iter] = rel_resnorm; + } + + ++iter; + (mgr_data -> num_iterations) = iter; + (mgr_data -> final_rel_residual_norm) = rel_resnorm; + + if (my_id == 0 && print_level > 1) + { + hypre_printf(" MGRCycle %2d %e %f %e \n", iter, + resnorm, conv_factor, rel_resnorm); + } + } + + /* check convergence within max_iter */ + if (iter == max_iter && tol > 0.) + { + Solve_err_flag = 1; + hypre_error(HYPRE_ERROR_CONV); + } + + /*----------------------------------------------------------------------- + * Print closing statistics + * Add operator and grid complexity stats + *-----------------------------------------------------------------------*/ + + if (iter > 0 && init_resnorm) + conv_factor = pow((resnorm/init_resnorm),(1.0/(HYPRE_Real) iter)); + else + conv_factor = 1.; + + if (print_level > 1) + { + /*** compute operator and grid complexities here ?? ***/ + if (my_id == 0) + { + if (Solve_err_flag == 1) + { + hypre_printf("\n\n=============================================="); + hypre_printf("\n NOTE: Convergence tolerance was not achieved\n"); + hypre_printf(" within the allowed %d iterations\n",max_iter); + hypre_printf("=============================================="); + } + hypre_printf("\n\n Average Convergence Factor = %f \n",conv_factor); + hypre_printf(" Number of coarse levels = %d \n",(mgr_data -> num_coarse_levels)); +// hypre_printf("\n\n Complexity: grid = %f\n",grid_cmplxty); +// hypre_printf(" operator = %f\n",operat_cmplxty); +// hypre_printf(" cycle = %f\n\n\n\n",cycle_cmplxty); + } + } + + return hypre_error_flag; +} + +HYPRE_Int +hypre_MGRFrelaxVcycle ( void *Frelax_vdata ) +{ + hypre_ParAMGData *Frelax_data = (hypre_ParAMGData*) Frelax_vdata; + + HYPRE_Int Not_Finished = 1; + HYPRE_Int level = 0; + HYPRE_Int cycle_param = 1; + HYPRE_Int j, Solve_err_flag, local_size, coarse_grid, fine_grid; + + HYPRE_Int num_sweeps = 1; + HYPRE_Int relax_order = 1; + HYPRE_Int relax_type = 3; + HYPRE_Int relax_weight = 1; + HYPRE_Int omega = 1; + + hypre_ParVector **F_array = (Frelax_data) -> F_array; + hypre_ParVector **U_array = (Frelax_data) -> U_array; + + hypre_ParCSRMatrix **A_array = ((Frelax_data) -> A_array); + hypre_ParCSRMatrix **R_array = ((Frelax_data) -> P_array); + hypre_ParCSRMatrix **P_array = ((Frelax_data) -> P_array); + HYPRE_Int **CF_marker_array = ((Frelax_data) -> CF_marker_array); + + hypre_ParVector *Vtemp = (Frelax_data) -> Vtemp; + hypre_ParVector *Ztemp = (Frelax_data) -> Ztemp; + + HYPRE_Int num_c_levels = (Frelax_data) -> num_levels-1; + + hypre_ParVector *Aux_F = NULL; + hypre_ParVector *Aux_U = NULL; + + HYPRE_Real alpha, beta; + + while (Not_Finished) + { + local_size = hypre_VectorSize(hypre_ParVectorLocalVector(F_array[level])); + hypre_VectorSize(hypre_ParVectorLocalVector(Vtemp)) = local_size; + Aux_F = F_array[level]; + Aux_U = U_array[level]; + + if (cycle_param == 1) { + // visiting coarser grids and relax + // This also doubles as the smoother used when no + // coarsening is done + for (j = 0; j < num_sweeps; j++) { + Solve_err_flag = hypre_BoomerAMGRelaxIF(A_array[level], + Aux_F, + CF_marker_array[level], + relax_type, + relax_order, + cycle_param, + relax_weight, + omega, + NULL, + Aux_U, + Vtemp, + Ztemp); + } + if ((num_c_levels > 0) && (level != num_c_levels)) + { + fine_grid = level; + coarse_grid = level + 1; + + hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0); + + alpha = -1.0; + beta = 1.0; + + // JSP: avoid unnecessary copy using out-of-place version of SpMV + hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A_array[fine_grid], U_array[fine_grid], + beta, F_array[fine_grid], Vtemp); + + alpha = 1.0; + beta = 0.0; + hypre_ParCSRMatrixMatvecT(alpha,R_array[fine_grid],Vtemp, + beta,F_array[coarse_grid]); + + ++level; + cycle_param = 1; + if (level == num_c_levels) cycle_param = 3; + } + } else if (cycle_param == 3) { + // solve the coarsest grid with Gaussian elimination + hypre_GaussElimSolve(Frelax_data, level, 9); + cycle_param = 2; + } else if (cycle_param == 2) { + /*--------------------------------------------------------------- + * Visit finer level next. + * Interpolate and add correction using hypre_ParCSRMatrixMatvec. + * Reset counters and cycling parameters for finer level. + *--------------------------------------------------------------*/ + + fine_grid = level - 1; + coarse_grid = level; + alpha = 1.0; + beta = 1.0; + + hypre_ParCSRMatrixMatvec(alpha, P_array[fine_grid], + U_array[coarse_grid], + beta, U_array[fine_grid]); + + --level; + cycle_param = 2; + if (level == 0) cycle_param = 99; + } else { + Not_Finished = 0; + } + } + + return Solve_err_flag; +} + +HYPRE_Int +hypre_MGRCycle( void *mgr_vdata, + hypre_ParVector **F_array, + hypre_ParVector **U_array ) +{ +// MPI_Comm comm; + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + + HYPRE_Int Solve_err_flag; + HYPRE_Int level; + HYPRE_Int coarse_grid; + HYPRE_Int fine_grid; + HYPRE_Int Not_Finished; + HYPRE_Int cycle_type; + + hypre_ParCSRMatrix **A_array = (mgr_data -> A_array); + hypre_ParCSRMatrix **RT_array = (mgr_data -> RT_array); + hypre_ParCSRMatrix **P_array = (mgr_data -> P_array); + hypre_ParCSRMatrix *RAP = (mgr_data -> RAP); + HYPRE_Solver cg_solver = (mgr_data -> coarse_grid_solver); + HYPRE_Int (*coarse_grid_solver_solve)(void*, void*, void*, void*) = (mgr_data -> coarse_grid_solver_solve); + + HYPRE_Int **CF_marker = (mgr_data -> CF_marker_array); + HYPRE_Int nsweeps = (mgr_data -> num_relax_sweeps); + HYPRE_Int relax_type = (mgr_data -> relax_type); + HYPRE_Real relax_weight = (mgr_data -> relax_weight); + HYPRE_Real omega = (mgr_data -> omega); + HYPRE_Real **relax_l1_norms = (mgr_data -> l1_norms); + hypre_ParVector *Vtemp = (mgr_data -> Vtemp); + hypre_ParVector *Ztemp = (mgr_data -> Ztemp); + + HYPRE_Int i, relax_points; + HYPRE_Int num_coarse_levels = (mgr_data -> num_coarse_levels); + + HYPRE_Real alpha; + HYPRE_Real beta; + + HYPRE_Int Frelax_method = (mgr_data -> Frelax_method); + hypre_ParAMGData **FrelaxVcycleData = (mgr_data -> FrelaxVcycleData); + + /* Initialize */ +// comm = hypre_ParCSRMatrixComm(A_array[0]); + Solve_err_flag = 0; + Not_Finished = 1; + cycle_type = 1; + level = 0; + + /***** Main loop ******/ + while (Not_Finished) + { + + /* Do coarse grid correction solve */ + if(cycle_type == 3) + { + /* call coarse grid solver here */ + /* default is BoomerAMG */ + coarse_grid_solver_solve(cg_solver, RAP, F_array[level], U_array[level]); + /**** cycle up ***/ + cycle_type = 2; + } + /* F-relaxation */ + else if(cycle_type == 1) + { + + fine_grid = level; + coarse_grid = level + 1; + /* Relax solution - F-relaxation */ + relax_points = -1; + + if (Frelax_method == 0) + { /* (single level) relaxation for A_ff */ + if (relax_type == 18) + { + hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid], + relax_points, relax_weight, relax_l1_norms[fine_grid], + U_array[fine_grid], Vtemp); + } + else if(relax_type == 8 || relax_type == 13 || relax_type == 14) + { + hypre_BoomerAMGRelax(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid], + relax_type, relax_points, relax_weight, + omega, relax_l1_norms[fine_grid], U_array[fine_grid], Vtemp, Ztemp); + } + else + { + for(i=0; i -#else -/* RDF: This needs to be integrated with the hypre blas/lapack stuff */ -#ifdef __cplusplus -extern "C" { -#endif -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (const char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *); -#ifdef __cplusplus -} -#endif -#endif +#include "_hypre_lapack.h" /*-------------------------------------------------------------------------- * hypre_BoomerAMGRelax @@ -2368,27 +2355,31 @@ /*----------------------------------------------------------------- * Copy f into temporary vector. *-----------------------------------------------------------------*/ - + PUSH_RANGE("RELAX",4); +#ifdef HYPRE_USE_GPU + hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(Vtemp)); + hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f)); + VecCopy(Vtemp_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(Vtemp)),HYPRE_STREAM(4)); +#else hypre_ParVectorCopy(f,Vtemp); - +#endif /*----------------------------------------------------------------- * Perform Matvec Vtemp=f-Au *-----------------------------------------------------------------*/ - hypre_ParCSRMatrixMatvec(-1.0,A, u, 1.0, Vtemp); + hypre_ParCSRMatrixMatvec(-relax_weight,A, u, relax_weight, Vtemp); +#ifdef HYPRE_USE_GPU + VecScale(u_data,Vtemp_data,l1_norms,n,HYPRE_STREAM(4)); +#else for (i = 0; i < n; i++) { - /*----------------------------------------------------------- * If diagonal is nonzero, relax point i; otherwise, skip it. *-----------------------------------------------------------*/ - - if (A_diag_data[A_diag_i[i]] != zero) - { - u_data[i] += relax_weight * Vtemp_data[i] - / A_diag_data[A_diag_i[i]]; - } + u_data[i] += Vtemp_data[i] / l1_norms[i]; } +#endif + POP_RANGE; } break; @@ -4131,28 +4122,13 @@ piv = hypre_CTAlloc(HYPRE_Int, n_global); - /* write over A with LU */ -#ifdef HYPRE_USING_ESSL - dgetrf(n_global, n_global, A_mat, n_global, piv, &info); - -#else - hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&n_global, &n_global, - A_mat, &n_global, piv, &info); -#endif + /* write over A with LU */ + hypre_dgetrf(&n_global, &n_global, A_mat, &n_global, piv, &info); - /*now b_vec = inv(A)*b_vec */ -#ifdef HYPRE_USING_ESSL - dgetrs("N", n_global, &one_i, A_mat, - n_global, piv, b_vec, - n_global, &info); + /*now b_vec = inv(A)*b_vec */ + hypre_dgetrs("N", &n_global, &one_i, A_mat, &n_global, piv, b_vec, &n_global, &info); -#else - hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &n_global, &one_i, A_mat, - &n_global, piv, b_vec, - &n_global, &info); -#endif hypre_TFree(piv); - for (i = 0; i < n; i++) { @@ -4336,25 +4312,11 @@ piv = hypre_CTAlloc(HYPRE_Int, n_global); /* write over A with LU */ -#ifdef HYPRE_USING_ESSL - dgetrf(n_global, n_global, A_tmp, n_global, piv, &my_info); - -#else - hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&n_global, &n_global, - A_tmp, &n_global, piv, &my_info); -#endif + hypre_dgetrf(&n_global, &n_global, A_tmp, &n_global, piv, &my_info); /*now b_vec = inv(A)*b_vec */ -#ifdef HYPRE_USING_ESSL - dgetrs("N", n_global, &one_i, A_tmp, - n_global, piv, b_vec, - n_global, &my_info); + hypre_dgetrs("N", &n_global, &one_i, A_tmp, &n_global, piv, b_vec, &n_global, &my_info); -#else - hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &n_global, &one_i, A_tmp, - &n_global, piv, b_vec, - &n_global, &my_info); -#endif hypre_TFree(piv); } for (i = 0; i < n; i++) @@ -4372,7 +4334,7 @@ return hypre_error_flag; } - +HYPRE_CUDA_GLOBAL HYPRE_Int gselim(HYPRE_Real *A, HYPRE_Real *x, HYPRE_Int n) diff -Nru hypre-2.11.2/src/parcsr_ls/par_relax_more.c hypre-2.13.0/src/parcsr_ls/par_relax_more.c --- hypre-2.11.2/src/parcsr_ls/par_relax_more.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_relax_more.c 2017-10-20 17:42:22.000000000 +0000 @@ -26,52 +26,61 @@ HYPRE_Real e_max; HYPRE_Real row_sum, max_norm; - HYPRE_Real *col_val; + HYPRE_Real *A_diag_data; + HYPRE_Real *A_offd_data; HYPRE_Real temp; HYPRE_Real diag_value; HYPRE_Int pos_diag, neg_diag; - HYPRE_Int start_row, end_row; - HYPRE_Int row_length; - HYPRE_Int *col_ind; + HYPRE_Int A_num_rows; + HYPRE_Int *A_diag_i; + HYPRE_Int *A_offd_i; HYPRE_Int j; - HYPRE_Int i; + HYPRE_Int i, start; /* estimate with the inf-norm of A - should be ok for SPD matrices */ - start_row = hypre_ParCSRMatrixFirstRowIndex(A); - end_row = hypre_ParCSRMatrixLastRowIndex(A); + A_num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A)); + A_diag_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A)); + A_diag_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A)); + A_offd_i = hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(A)); + A_offd_data = hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(A)); max_norm = 0.0; pos_diag = neg_diag = 0; - for ( i = start_row; i <= end_row; i++ ) + for ( i = 0; i < A_num_rows; i++ ) { - HYPRE_ParCSRMatrixGetRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val); - - row_sum = 0.0; - - for (j = 0; j < row_length; j++) + start = A_diag_i[i]; + diag_value = A_diag_data[start]; + if (diag_value > 0) { - if (j==0) diag_value = fabs(col_val[j]); - - row_sum += fabs(col_val[j]); + pos_diag++; + } + if (diag_value < 0) + { + neg_diag++; + diag_value = -diag_value; + } + row_sum = diag_value; - if ( col_ind[j] == i && col_val[j] > 0.0 ) pos_diag++; - if ( col_ind[j] == i && col_val[j] < 0.0 ) neg_diag++; + /*for (j = 0; j < row_length; j++)*/ + for (j = start+1; j < A_diag_i[i+1]; j++) + { + row_sum += fabs(A_diag_data[j]); + } + for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++) + { + row_sum += fabs(A_offd_data[j]); } if (scale) { if (diag_value != 0.0) row_sum = row_sum/diag_value; } - - if ( row_sum > max_norm ) max_norm = row_sum; - - HYPRE_ParCSRMatrixRestoreRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val); } /* get max across procs */ diff -Nru hypre-2.11.2/src/parcsr_ls/par_stats.c hypre-2.13.0/src/parcsr_ls/par_stats.c --- hypre-2.11.2/src/parcsr_ls/par_stats.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_stats.c 2017-10-20 17:42:22.000000000 +0000 @@ -155,6 +155,7 @@ HYPRE_Int additive; HYPRE_Int mult_additive; HYPRE_Int simple; + HYPRE_Int add_end; HYPRE_Int add_rlx; HYPRE_Real add_rlx_wt; @@ -175,10 +176,10 @@ additive = hypre_ParAMGDataAdditive(amg_data); mult_additive = hypre_ParAMGDataMultAdditive(amg_data); simple = hypre_ParAMGDataSimple(amg_data); + add_end = hypre_ParAMGDataAddLastLvl(amg_data); add_rlx = hypre_ParAMGDataAddRelaxType(amg_data); add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data); - A_block_array = hypre_ParAMGDataABlockArray(amg_data); P_block_array = hypre_ParAMGDataPBlockArray(amg_data); @@ -923,19 +924,40 @@ if (additive == 0 || mult_additive == 0 || simple == 0) { + HYPRE_Int add_lvl = add_end; + if (add_end == -1) add_lvl = num_levels-1; if (additive > -1) - hypre_printf( " Additive V-cycle starting at level %d \n", additive); + hypre_printf( " Additive V-cycle 1st level %d last level %d: \n", additive, add_lvl); if (mult_additive > -1) - hypre_printf( " Mult-Additive V-cycle starting at level %d \n", mult_additive); + hypre_printf( " Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl); if (simple > -1) - hypre_printf( " Simplified Mult-Additive V-cycle starting at level %d \n", simple); - hypre_printf( "\n"); + hypre_printf( " Simplified Mult-Additive V-cycle 1st level %d: last level %d \n", simple, add_lvl); hypre_printf( " Relaxation Parameters:\n"); - hypre_printf( " Visiting Grid: down up coarse\n"); - hypre_printf( " Number of sweeps: %4d %2d %4d \n", + if (add_lvl == num_levels-1) + { + hypre_printf( " Visiting Grid: down up coarse\n"); + hypre_printf( " Number of sweeps: %4d %2d %4d \n", num_grid_sweeps[1], - num_grid_sweeps[2],(2*num_grid_sweeps[3])); - hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d %2d \n", add_rlx, add_rlx, add_rlx); + num_grid_sweeps[1],(2*num_grid_sweeps[1])); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d %2d \n", add_rlx, add_rlx, add_rlx); + } + else + { + hypre_printf( " Visiting Grid: down up\n"); + hypre_printf( " Number of sweeps: %4d %2d\n", + num_grid_sweeps[1], num_grid_sweeps[1]); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d\n", add_rlx, add_rlx); + } + if (add_lvl < num_levels -1) + { + hypre_printf( " \n"); + hypre_printf( "Multiplicative portion: \n"); + hypre_printf( " Visiting Grid: down up coarse\n"); + hypre_printf( " Number of sweeps: %4d %2d %4d\n", + num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %4d %2d %4d\n", + grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]); + } if (add_rlx == 0) hypre_printf( " Relaxation Weight: %e \n", add_rlx_wt); hypre_printf( " Point types, partial sweeps (1=C, -1=F):\n"); hypre_printf( " Pre-CG relaxation (down):"); @@ -953,12 +975,25 @@ } else if (additive > 0 || mult_additive > 0 || simple > 0) { + HYPRE_Int add_lvl = add_end; + if (add_end == -1) add_lvl = num_levels-1; hypre_printf( " Relaxation Parameters:\n"); - hypre_printf( " Visiting Grid: down up \n"); - hypre_printf( " Number of sweeps: %4d %2d \n", + if (add_lvl < num_levels -1) + { + hypre_printf( " Visiting Grid: down up coarse\n"); + hypre_printf( " Number of sweeps: %4d %2d %4d\n", + num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %4d %2d %4d\n", + grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]); + } + else + { + hypre_printf( " Visiting Grid: down up \n"); + hypre_printf( " Number of sweeps: %4d %2d \n", num_grid_sweeps[1], num_grid_sweeps[2]); - hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %4d %2d \n", + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %4d %2d \n", grid_relax_type[1], grid_relax_type[2]); + } hypre_printf( " Point types, partial sweeps (1=C, -1=F):\n"); if (grid_relax_points && grid_relax_type[1] != 8) { @@ -995,18 +1030,27 @@ } hypre_printf( "\n\n"); if (additive > -1) - hypre_printf( " Additive V-cycle starting at level %d \n", additive); + hypre_printf( " Additive V-cycle 1st level %d last level %d: \n", additive, add_lvl); if (mult_additive > -1) - hypre_printf( " Mult-Additive V-cycle starting at level %d \n", mult_additive); + hypre_printf( " Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl); if (simple > -1) - hypre_printf( " Simplified Mult-Additive V-cycle starting at level %d \n", simple); - hypre_printf( "\n"); + hypre_printf( " Simplified Mult-Additive V-cycle 1st level %d: last level %d \n", simple, add_lvl); hypre_printf( " Relaxation Parameters:\n"); - hypre_printf( " Visiting Grid: down up coarse\n"); - hypre_printf( " Number of sweeps: %4d %2d %4d \n", + if (add_lvl == num_levels-1) + { + hypre_printf( " Visiting Grid: down up coarse\n"); + hypre_printf( " Number of sweeps: %4d %2d %4d \n", num_grid_sweeps[1], - num_grid_sweeps[2],(2*num_grid_sweeps[3])); - hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d %2d \n", add_rlx, add_rlx, add_rlx); + num_grid_sweeps[1],(2*num_grid_sweeps[1])); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d %2d \n", add_rlx, add_rlx, add_rlx); + } + else + { + hypre_printf( " Visiting Grid: down up\n"); + hypre_printf( " Number of sweeps: %4d %2d\n", + num_grid_sweeps[1], num_grid_sweeps[1]); + hypre_printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %2d %2d\n", add_rlx, add_rlx); + } if (add_rlx == 0) hypre_printf( " Relaxation Weight: %e \n", add_rlx_wt); hypre_printf( " Point types, partial sweeps (1=C, -1=F):\n"); hypre_printf( " Pre-CG relaxation (down):"); diff -Nru hypre-2.11.2/src/parcsr_ls/par_strength.c hypre-2.13.0/src/parcsr_ls/par_strength.c --- hypre-2.11.2/src/parcsr_ls/par_strength.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_strength.c 2017-10-20 17:42:22.000000000 +0000 @@ -533,6 +533,396 @@ return (ierr); } +/* + Create Strength matrix from CF marker array data. Provides a more + general form to build S for specific nodes of the 'global' matrix + (for example, F points or A_FF part), given the entire matrix. + These nodes have the SMRK tag. + + Currently assumes num_functions == 1, hence separate routine is used + for now. Could possibly be merged with BoomerAMGCreateS() to yield a + more general function. + */ +HYPRE_Int +hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, + HYPRE_Real strength_threshold, + HYPRE_Real max_row_sum, + HYPRE_Int *CF_marker, + HYPRE_Int SMRK, + hypre_ParCSRMatrix **S_ptr) +{ +#ifdef HYPRE_PROFILE + hypre_profile_times[HYPRE_TIMER_ID_CREATES] -= hypre_MPI_Wtime(); +#endif + + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Real *A_offd_data = NULL; + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + + HYPRE_Int *row_starts = hypre_ParCSRMatrixRowStarts(A); + HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A); + HYPRE_Int num_nonzeros_diag; + HYPRE_Int num_nonzeros_offd = 0; + HYPRE_Int num_cols_offd = 0; + + hypre_ParCSRMatrix *S; + hypre_CSRMatrix *S_diag; + HYPRE_Int *S_diag_i; + HYPRE_Int *S_diag_j; + /* HYPRE_Real *S_diag_data; */ + hypre_CSRMatrix *S_offd; + HYPRE_Int *S_offd_i = NULL; + HYPRE_Int *S_offd_j = NULL; + /* HYPRE_Real *S_offd_data; */ + + HYPRE_Real diag, row_scale, row_sum; + HYPRE_Int i, jj, jA, jS; + + HYPRE_Int ierr = 0; + + HYPRE_Int *prefix_sum_workspace; + + /*-------------------------------------------------------------- + * Compute a ParCSR strength matrix, S. + * + * For now, the "strength" of dependence/influence is defined in + * the following way: i depends on j if + * aij > hypre_max (k != i) aik, aii < 0 + * or + * aij < hypre_min (k != i) aik, aii >= 0 + * Then S_ij = 1, else S_ij = 0. + * + * NOTE: the entries are negative initially, corresponding + * to "unaccounted-for" dependence. + *----------------------------------------------------------------*/ + + num_nonzeros_diag = A_diag_i[num_variables]; + num_cols_offd = hypre_CSRMatrixNumCols(A_offd); + + A_offd_i = hypre_CSRMatrixI(A_offd); + num_nonzeros_offd = A_offd_i[num_variables]; + + S = hypre_ParCSRMatrixCreate(comm, global_num_vars, global_num_vars, + row_starts, row_starts, + num_cols_offd, num_nonzeros_diag, num_nonzeros_offd); +/* row_starts is owned by A, col_starts = row_starts */ + hypre_ParCSRMatrixSetRowStartsOwner(S,0); + S_diag = hypre_ParCSRMatrixDiag(S); + hypre_CSRMatrixI(S_diag) = hypre_CTAlloc(HYPRE_Int, num_variables+1); + hypre_CSRMatrixJ(S_diag) = hypre_CTAlloc(HYPRE_Int, num_nonzeros_diag); + S_offd = hypre_ParCSRMatrixOffd(S); + hypre_CSRMatrixI(S_offd) = hypre_CTAlloc(HYPRE_Int, num_variables+1); + + S_diag_i = hypre_CSRMatrixI(S_diag); + HYPRE_Int *S_temp_diag_j = hypre_CSRMatrixJ(S_diag); + S_offd_i = hypre_CSRMatrixI(S_offd); + + S_diag_j = hypre_CTAlloc(HYPRE_Int, num_nonzeros_diag); +/* +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < num_nonzeros_diag; i++) + S_diag_j[i] = 0; +*/ + + HYPRE_Int *S_temp_offd_j = NULL; + + if (num_cols_offd) + { + A_offd_data = hypre_CSRMatrixData(A_offd); + hypre_CSRMatrixJ(S_offd) = hypre_CTAlloc(HYPRE_Int, num_nonzeros_offd); + S_temp_offd_j = hypre_CSRMatrixJ(S_offd); + HYPRE_Int *col_map_offd_S = hypre_TAlloc(HYPRE_Int, num_cols_offd); + hypre_ParCSRMatrixColMapOffd(S) = col_map_offd_S; + + S_offd_j = hypre_CTAlloc(HYPRE_Int, num_nonzeros_offd); +/* +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < num_nonzeros_offd; i++) + S_offd_j[i] = 0; +*/ + + HYPRE_Int *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A); +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < num_cols_offd; i++) + col_map_offd_S[i] = col_map_offd_A[i]; + } + + + /*------------------------------------------------------------------- + * Get the dof_func data for the off-processor columns + *-------------------------------------------------------------------*/ + + if (!comm_pkg) + { + hypre_MatvecCommPkgCreate(A); + + comm_pkg = hypre_ParCSRMatrixCommPkg(A); + } + + /*HYPRE_Int prefix_sum_workspace[2*(hypre_NumThreads() + 1)];*/ + prefix_sum_workspace = hypre_TAlloc(HYPRE_Int, 2*(hypre_NumThreads() + 1)); + + /* give S same nonzero structure as A */ + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel private(i,diag,row_scale,row_sum,jA,jS) +#endif + { + HYPRE_Int start, stop; + hypre_GetSimpleThreadPartition(&start, &stop, num_variables); + HYPRE_Int jS_diag = 0, jS_offd = 0; + + for (i = start; i < stop; i++) + { + if (CF_marker[i] == SMRK) { + S_diag_i[i] = jS_diag; + if (num_cols_offd) + { + S_offd_i[i] = jS_offd; + } + + diag = A_diag_data[A_diag_i[i]]; + + /* compute scaling factor and row sum */ + row_scale = 0.0; + row_sum = diag; + if (diag < 0) + { + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + jj = A_diag_j[jA]; + if (CF_marker[jj] == SMRK) { + row_scale = hypre_max(row_scale, A_diag_data[jA]); + row_sum += A_diag_data[jA]; + } + } + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + jj = A_offd_j[jA]; + if (CF_marker[jj] == SMRK) { + row_scale = hypre_max(row_scale, A_offd_data[jA]); + row_sum += A_offd_data[jA]; + } + } + } + else + { + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + jj = A_diag_j[jA]; + if (CF_marker[jj] == SMRK) { + row_scale = hypre_min(row_scale, A_diag_data[jA]); + row_sum += A_diag_data[jA]; + } + } + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + jj = A_offd_j[jA]; + if (CF_marker[jj] == SMRK) { + row_scale = hypre_min(row_scale, A_offd_data[jA]); + row_sum += A_offd_data[jA]; + } + } + } /* diag >= 0*/ + + jS_diag += A_diag_i[i + 1] - A_diag_i[i] - 1; + jS_offd += A_offd_i[i + 1] - A_offd_i[i]; + + /* compute row entries of S */ + S_temp_diag_j[A_diag_i[i]] = -1; + if ((fabs(row_sum) > fabs(diag)*max_row_sum) && (max_row_sum < 1.0)) + { + /* make all dependencies weak */ + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + jj = A_diag_j[jA]; + S_temp_diag_j[jA] = -1; + } + jS_diag -= A_diag_i[i + 1] - (A_diag_i[i] + 1); + + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + S_temp_offd_j[jA] = -1; + } + jS_offd -= A_offd_i[i + 1] - A_offd_i[i]; + } + else + { + if (diag < 0) + { + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + jj = A_diag_j[jA]; + if (CF_marker[jj] == SMRK) { + if (A_diag_data[jA] <= strength_threshold * row_scale) + { + S_temp_diag_j[jA] = -1; + --jS_diag; + } + else + { + S_temp_diag_j[jA] = jj; + } + } else { + S_temp_diag_j[jA] = -1; + } + } + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + jj = A_offd_j[jA]; + if (CF_marker[jj] == SMRK) { + if (A_offd_data[jA] <= strength_threshold * row_scale) + { + S_temp_offd_j[jA] = -1; + --jS_offd; + } + else + { + S_temp_offd_j[jA] = jj; + } + } else { + S_temp_offd_j[jA] = -1; + } + } + } /* diag < 0 */ + else + { + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + jj = A_diag_j[jA]; + if (CF_marker[jj] == SMRK) { + if (A_diag_data[jA] >= strength_threshold * row_scale) + { + S_temp_diag_j[jA] = -1; + --jS_diag; + } + else + { + S_temp_diag_j[jA] = jj; + } + } else { + S_temp_diag_j[jA] = -1; + } + } + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + jj = A_offd_j[jA]; + if (CF_marker[jj] == SMRK) { + if (A_offd_data[jA] >= strength_threshold * row_scale) + { + S_temp_offd_j[jA] = -1; + --jS_offd; + } + else + { + S_temp_offd_j[jA] = jj; + } + } else { + S_temp_offd_j[jA] = -1; + } + } + } /* diag >= 0 */ + } /* !((row_sum > max_row_sum) && (max_row_sum < 1.0)) */ + } /* CF_marker == SMRK */ + else + { + S_diag_i[i] = jS_diag; + if (num_cols_offd) + { + S_offd_i[i] = jS_offd; + } + jS_diag += A_diag_i[i + 1] - A_diag_i[i] - 1; + jS_offd += A_offd_i[i + 1] - A_offd_i[i]; + + for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++) + { + S_temp_diag_j[jA] = -1; + } + jS_diag -= A_diag_i[i + 1] - (A_diag_i[i] + 1); + + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + S_temp_offd_j[jA] = -1; + } + jS_offd -= A_offd_i[i + 1] - A_offd_i[i]; + } /* CF_marker != SMRK */ + } /* for each variable */ + + hypre_prefix_sum_pair(&jS_diag, S_diag_i + num_variables, &jS_offd, S_offd_i + num_variables, prefix_sum_workspace); + + /*-------------------------------------------------------------- + * "Compress" the strength matrix. + * + * NOTE: S has *NO DIAGONAL ELEMENT* on any row. Caveat Emptor! + * + * NOTE: This "compression" section of code may be removed, and + * coarsening will still be done correctly. However, the routine + * that builds interpolation would have to be modified first. + *----------------------------------------------------------------*/ + + for (i = start; i < stop; i++) + { + S_diag_i[i] += jS_diag; + S_offd_i[i] += jS_offd; + + jS = S_diag_i[i]; + for (jA = A_diag_i[i]; jA < A_diag_i[i+1]; jA++) + { + if (S_temp_diag_j[jA] > -1) + { + S_diag_j[jS] = S_temp_diag_j[jA]; + jS++; + } + } + + jS = S_offd_i[i]; + for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++) + { + if (S_temp_offd_j[jA] > -1) + { + S_offd_j[jS] = S_temp_offd_j[jA]; + jS++; + } + } + } /* for each variable */ + + } /* omp parallel */ + + hypre_CSRMatrixNumNonzeros(S_diag) = S_diag_i[num_variables]; + hypre_CSRMatrixNumNonzeros(S_offd) = S_offd_i[num_variables]; + hypre_CSRMatrixJ(S_diag) = S_diag_j; + hypre_CSRMatrixJ(S_offd) = S_offd_j; + + hypre_ParCSRMatrixCommPkg(S) = NULL; + + *S_ptr = S; + + hypre_TFree(prefix_sum_workspace); + hypre_TFree(S_temp_diag_j); + hypre_TFree(S_temp_offd_j); + +#ifdef HYPRE_PROFILE + hypre_profile_times[HYPRE_TIMER_ID_CREATES] += hypre_MPI_Wtime(); +#endif + + return (ierr); +} /*==========================================================================*/ /*==========================================================================*/ @@ -1497,6 +1887,7 @@ temp = hypre_UnorderedIntSetCopyToArray(&found_set, &num_cols_offd_C); + hypre_UnorderedIntSetDestroy(&found_set); hypre_TFree(S_ext_i); hypre_TFree(S_ext_j); diff -Nru hypre-2.11.2/src/parcsr_ls/par_sv_interp_lsfit.c hypre-2.13.0/src/parcsr_ls/par_sv_interp_lsfit.c --- hypre-2.11.2/src/parcsr_ls/par_sv_interp_lsfit.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_sv_interp_lsfit.c 2017-10-20 17:42:22.000000000 +0000 @@ -1,17 +1,7 @@ #include "_hypre_parcsr_ls.h" #include "Common.h" - - -#ifdef HYPRE_USING_ESSL -#include -#else -HYPRE_Int hypre_F90_NAME_BLAS(dgemm, DGEMM) (char *, char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_BLAS(dgemv, DGEMV) (char *, HYPRE_Int * , HYPRE_Int * , HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *); - -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *); - -#endif +#include "_hypre_blas.h" +#include "_hypre_lapack.h" #define ADJUST(a,b) (adjust_list[(a)*(num_functions-1)+(b)]) @@ -414,36 +404,32 @@ /* now B_s <-delta*Beta*Beta^T + B_s */ /* usage: DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) C := alpha*op( A )*op( B ) + beta*C */ - hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "T", &num_smooth_vecs, - &num_smooth_vecs, &k_size, - &delta, Beta, &num_smooth_vecs, Beta, - &num_smooth_vecs, &one, B_s, &num_smooth_vecs); + hypre_dgemm("N", "T", &num_smooth_vecs, + &num_smooth_vecs, &k_size, + &delta, Beta, &num_smooth_vecs, Beta, + &num_smooth_vecs, &one, B_s, &num_smooth_vecs); /* now do alpha <- (alpha - beta*w)*/ /* usage: DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) y := alpha*A*x + beta*y */ - hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &num_smooth_vecs, &k_size, &mone, - Beta, &num_smooth_vecs, w_old, &one_i, - &one, alpha, &one_i); - + hypre_dgemv("N", &num_smooth_vecs, &k_size, &mone, + Beta, &num_smooth_vecs, w_old, &one_i, + &one, alpha, &one_i); /* now get alpha <- inv(B_s)*alpha */ - /*write over B_s with LU */ - hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&num_smooth_vecs, &num_smooth_vecs, - B_s, &num_smooth_vecs, piv, &info); - - /*now get alpha */ - hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &num_smooth_vecs, &one_i, B_s, - &num_smooth_vecs, piv, alpha, - &num_smooth_vecs, &info); + /*write over B_s with LU */ + hypre_dgetrf(&num_smooth_vecs, &num_smooth_vecs, + B_s, &num_smooth_vecs, piv, &info); - + /*now get alpha */ + hypre_dgetrs("N", &num_smooth_vecs, &one_i, B_s, + &num_smooth_vecs, piv, alpha, + &num_smooth_vecs, &info); /* now w <- w + (delta)*(Beta)^T*(alpha) */ - hypre_F90_NAME_BLAS(dgemv,DGEMV)("T", &num_smooth_vecs, &k_size, &delta, - Beta, &num_smooth_vecs, alpha, &one_i, - &one, w, &one_i); - + hypre_dgemv("T", &num_smooth_vecs, &k_size, &delta, + Beta, &num_smooth_vecs, alpha, &one_i, + &one, w, &one_i); /* note:we have w_old still, but we don't need it unless we * want to use it in the future for something */ diff -Nru hypre-2.11.2/src/parcsr_ls/par_vardifconv.c hypre-2.13.0/src/parcsr_ls/par_vardifconv.c --- hypre-2.11.2/src/parcsr_ls/par_vardifconv.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_vardifconv.c 2017-10-20 17:42:22.000000000 +0000 @@ -351,10 +351,6 @@ } } - par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part); - rhs = hypre_ParVectorLocalVector(par_rhs); - hypre_VectorData(rhs) = rhs_data; - #ifdef HYPRE_NO_GLOBAL_PARTITION /* ideally we would use less storage earlier in this function, but this is fine for testing */ @@ -369,6 +365,11 @@ } #endif + par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part); + hypre_ParVectorOwnsPartitioning(par_rhs) = 0; + rhs = hypre_ParVectorLocalVector(par_rhs); + hypre_VectorData(rhs) = rhs_data; + A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size, global_part, global_part, num_cols_offd, diag_i[local_num_rows], diff -Nru hypre-2.11.2/src/parcsr_ls/par_vardifconv_rs.c hypre-2.13.0/src/parcsr_ls/par_vardifconv_rs.c --- hypre-2.11.2/src/parcsr_ls/par_vardifconv_rs.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/par_vardifconv_rs.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,559 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#include "_hypre_parcsr_ls.h" + +#ifndef M_PI +#define M_PI 3.14159265358979 +#endif + +/* examples in Ruge & Stuben paper */ +static HYPRE_Int rs_example = 1; +static HYPRE_Real rs_l = 3.0; + +/*-------------------------------------------------------------------------- + * hypre_GenerateVarDifConv: with the FD discretization and examples + * in Ruge-Stuben's paper ``Algebraic Multigrid'' + *--------------------------------------------------------------------------*/ + +HYPRE_ParCSRMatrix +GenerateRSVarDifConv( MPI_Comm comm, + HYPRE_Int nx, + HYPRE_Int ny, + HYPRE_Int nz, + HYPRE_Int P, + HYPRE_Int Q, + HYPRE_Int R, + HYPRE_Int p, + HYPRE_Int q, + HYPRE_Int r, + HYPRE_Real eps, + HYPRE_ParVector *rhs_ptr, + HYPRE_Int type) +{ + hypre_ParCSRMatrix *A; + hypre_CSRMatrix *diag; + hypre_CSRMatrix *offd; + hypre_ParVector *par_rhs; + hypre_Vector *rhs; + HYPRE_Real *rhs_data; + + HYPRE_Int *diag_i; + HYPRE_Int *diag_j; + HYPRE_Real *diag_data; + + HYPRE_Int *offd_i; + HYPRE_Int *offd_j; + HYPRE_Real *offd_data; + + HYPRE_Int *global_part; + HYPRE_Int ix, iy, iz; + HYPRE_Int cnt, o_cnt; + HYPRE_Int local_num_rows; + HYPRE_Int *col_map_offd; + HYPRE_Int row_index; + HYPRE_Int i,j; + + HYPRE_Int nx_local, ny_local, nz_local; + HYPRE_Int nx_size, ny_size, nz_size; + HYPRE_Int num_cols_offd; + HYPRE_Int grid_size; + + HYPRE_Int *nx_part; + HYPRE_Int *ny_part; + HYPRE_Int *nz_part; + + HYPRE_Int num_procs, my_id; + HYPRE_Int P_busy, Q_busy, R_busy; + + HYPRE_Real hhx, hhy, hhz; + HYPRE_Real xx, yy, zz; + HYPRE_Real afp, afm, bfp, bfm, cfp, cfm, di, ai, mux, ei, bi, + muy, fi, ci, muz, dfm, dfp, efm, efp, ffm, ffp, gi; + + hypre_MPI_Comm_size(comm,&num_procs); + hypre_MPI_Comm_rank(comm,&my_id); + + if (type >= 1 && type <= 3) + { + rs_example = type; + } + + grid_size = nx*ny*nz; + + hypre_GeneratePartitioning(nx,P,&nx_part); + hypre_GeneratePartitioning(ny,Q,&ny_part); + hypre_GeneratePartitioning(nz,R,&nz_part); + + global_part = hypre_CTAlloc(HYPRE_Int,P*Q*R+1); + + global_part[0] = 0; + cnt = 1; + for (iz = 0; iz < R; iz++) + { + nz_size = nz_part[iz+1]-nz_part[iz]; + for (iy = 0; iy < Q; iy++) + { + ny_size = ny_part[iy+1]-ny_part[iy]; + for (ix = 0; ix < P; ix++) + { + nx_size = nx_part[ix+1] - nx_part[ix]; + global_part[cnt] = global_part[cnt-1]; + global_part[cnt++] += nx_size*ny_size*nz_size; + } + } + } + + nx_local = nx_part[p+1] - nx_part[p]; + ny_local = ny_part[q+1] - ny_part[q]; + nz_local = nz_part[r+1] - nz_part[r]; + + my_id = r*(P*Q) + q*P + p; + num_procs = P*Q*R; + + local_num_rows = nx_local*ny_local*nz_local; + diag_i = hypre_CTAlloc(HYPRE_Int, local_num_rows+1); + offd_i = hypre_CTAlloc(HYPRE_Int, local_num_rows+1); + rhs_data = hypre_CTAlloc(HYPRE_Real, local_num_rows); + + P_busy = hypre_min(nx,P); + Q_busy = hypre_min(ny,Q); + R_busy = hypre_min(nz,R); + + num_cols_offd = 0; + if (p) num_cols_offd += ny_local*nz_local; + if (p < P_busy-1) num_cols_offd += ny_local*nz_local; + if (q) num_cols_offd += nx_local*nz_local; + if (q < Q_busy-1) num_cols_offd += nx_local*nz_local; + if (r) num_cols_offd += nx_local*ny_local; + if (r < R_busy-1) num_cols_offd += nx_local*ny_local; + + if (!local_num_rows) num_cols_offd = 0; + + col_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd); + + hhx = 1.0/(HYPRE_Real)(nx+1); + hhy = 1.0/(HYPRE_Real)(ny+1); + hhz = 1.0/(HYPRE_Real)(nz+1); + + cnt = 1; + o_cnt = 1; + diag_i[0] = 0; + offd_i[0] = 0; + for (iz = nz_part[r]; iz < nz_part[r+1]; iz++) + { + for (iy = ny_part[q]; iy < ny_part[q+1]; iy++) + { + for (ix = nx_part[p]; ix < nx_part[p+1]; ix++) + { + diag_i[cnt] = diag_i[cnt-1]; + offd_i[o_cnt] = offd_i[o_cnt-1]; + diag_i[cnt]++; + if (iz > nz_part[r]) + diag_i[cnt]++; + else + { + if (iz) + { + offd_i[o_cnt]++; + } + } + if (iy > ny_part[q]) + diag_i[cnt]++; + else + { + if (iy) + { + offd_i[o_cnt]++; + } + } + if (ix > nx_part[p]) + diag_i[cnt]++; + else + { + if (ix) + { + offd_i[o_cnt]++; + } + } + if (ix+1 < nx_part[p+1]) + diag_i[cnt]++; + else + { + if (ix+1 < nx) + { + offd_i[o_cnt]++; + } + } + if (iy+1 < ny_part[q+1]) + diag_i[cnt]++; + else + { + if (iy+1 < ny) + { + offd_i[o_cnt]++; + } + } + if (iz+1 < nz_part[r+1]) + diag_i[cnt]++; + else + { + if (iz+1 < nz) + { + offd_i[o_cnt]++; + } + } + cnt++; + o_cnt++; + } + } + } + + diag_j = hypre_CTAlloc(HYPRE_Int, diag_i[local_num_rows]); + diag_data = hypre_CTAlloc(HYPRE_Real, diag_i[local_num_rows]); + + if (num_procs > 1) + { + offd_j = hypre_CTAlloc(HYPRE_Int, offd_i[local_num_rows]); + offd_data = hypre_CTAlloc(HYPRE_Real, offd_i[local_num_rows]); + } + + row_index = 0; + cnt = 0; + o_cnt = 0; + for (iz = nz_part[r]; iz < nz_part[r+1]; iz++) + { + zz = (HYPRE_Real)(iz+1)*hhz; + for (iy = ny_part[q]; iy < ny_part[q+1]; iy++) + { + yy = (HYPRE_Real)(iy+1)*hhy; + for (ix = nx_part[p]; ix < nx_part[p+1]; ix++) + { + xx = (HYPRE_Real)(ix+1)*hhx; + afp = -eps*afun_rs(xx+0.5*hhx,yy,zz)/hhx/hhx; + afm = -eps*afun_rs(xx-0.5*hhx,yy,zz)/hhx/hhx; + bfp = -eps*bfun_rs(xx,yy+0.5*hhy,zz)/hhy/hhy; + bfm = -eps*bfun_rs(xx,yy-0.5*hhy,zz)/hhy/hhy; + cfp = -eps*cfun_rs(xx,yy,zz+0.5*hhz)/hhz/hhz; + cfm = -eps*cfun_rs(xx,yy,zz-0.5*hhz)/hhz/hhz; + /* first order terms */ + /* x-direction */ + di = dfun_rs(xx, yy, zz); + ai = afun_rs(xx, yy, zz); + if (di * hhx > eps * ai) + { + mux = eps * ai / (2.0 * di * hhx); + } + else if (di * hhx < -eps * ai) + { + mux = 1.0 + eps * ai / (2.0 * di * hhx); + } + else + { + mux = 0.5; + } + /* y-direction */ + ei = efun_rs(xx, yy, zz); + bi = bfun_rs(xx, yy, zz); + if (ei * hhy > eps * bi) + { + muy = eps * bi / (2.0 * ei * hhy); + } + else if (ei * hhy < -eps * bi) + { + muy = 1.0 + eps * bi / (2.0 * ei * hhy); + } + else + { + muy = 0.5; + } + /* z-direction */ + fi = ffun_rs(xx, yy, zz); + ci = cfun_rs(xx, yy, zz); + if (fi * hhz > eps * ci) + { + muz = eps * ci / (2.0 * fi * hhz); + } + else if (fi * hhz < -eps * ci) + { + muz = 1.0 + eps * ci / (2.0 * fi * hhz); + } + else + { + muz = 0.5; + } + + dfm = di * (mux - 1.0) / hhx; + dfp = di * mux / hhx; + efm = ei * (muy - 1.0) / hhy; + efp = ei * muy / hhy; + ffm = fi * (muz - 1.0) / hhz; + ffp = fi * muz / hhz; + gi = gfun_rs(xx, yy, zz); + /* stencil: center */ + diag_j[cnt] = row_index; + diag_data[cnt++] = -(afp + afm + bfp + bfm + cfp + cfm + + dfp + dfm + efp + efm + ffp + ffm) + gi; + /* rhs vector */ + rhs_data[row_index] = rfun_rs(xx,yy,zz); + /* apply boundary conditions */ + if (ix == 0) rhs_data[row_index] -= (afm+dfm) * bndfun_rs(0,yy,zz); + if (iy == 0) rhs_data[row_index] -= (bfm+efm) * bndfun_rs(xx,0,zz); + if (iz == 0) rhs_data[row_index] -= (cfm+ffm) * bndfun_rs(xx,yy,0); + if (ix+1 == nx) rhs_data[row_index] -= (afp+dfp) * bndfun_rs(1.0,yy,zz); + if (iy+1 == ny) rhs_data[row_index] -= (bfp+efp) * bndfun_rs(xx,1.0,zz); + if (iz+1 == nz) rhs_data[row_index] -= (cfp+ffp) * bndfun_rs(xx,yy,1.0); + /* stencil: z- */ + if (iz > nz_part[r]) + { + diag_j[cnt] = row_index - nx_local*ny_local; + diag_data[cnt++] = cfm + ffm; + } + else + { + if (iz) + { + offd_j[o_cnt] = hypre_map(ix,iy,iz-1,p,q,r-1,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = cfm + ffm; + } + } + /* stencil: y- */ + if (iy > ny_part[q]) + { + diag_j[cnt] = row_index - nx_local; + diag_data[cnt++] = bfm + efm; + } + else + { + if (iy) + { + offd_j[o_cnt] = hypre_map(ix,iy-1,iz,p,q-1,r,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = bfm + efm; + } + } + /* stencil: x- */ + if (ix > nx_part[p]) + { + diag_j[cnt] = row_index - 1; + diag_data[cnt++] = afm + dfm; + } + else + { + if (ix) + { + offd_j[o_cnt] = hypre_map(ix-1,iy,iz,p-1,q,r,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = afm + dfm; + } + } + /* stencil: x+ */ + if (ix+1 < nx_part[p+1]) + { + diag_j[cnt] = row_index + 1; + diag_data[cnt++] = afp + dfp; + } + else + { + if (ix+1 < nx) + { + offd_j[o_cnt] = hypre_map(ix+1,iy,iz,p+1,q,r,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = afp + dfp; + } + } + /* stencil: y+ */ + if (iy+1 < ny_part[q+1]) + { + diag_j[cnt] = row_index + nx_local; + diag_data[cnt++] = bfp + efp; + } + else + { + if (iy+1 < ny) + { + offd_j[o_cnt] = hypre_map(ix,iy+1,iz,p,q+1,r,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = bfp + efp; + } + } + /* stencil: z+ */ + if (iz+1 < nz_part[r+1]) + { + diag_j[cnt] = row_index + nx_local*ny_local; + diag_data[cnt++] = cfp + ffp; + } + else + { + if (iz+1 < nz) + { + offd_j[o_cnt] = hypre_map(ix,iy,iz+1,p,q,r+1,P,Q,R, + nx_part,ny_part,nz_part,global_part); + offd_data[o_cnt++] = cfp + ffp; + } + } + /* done with this row */ + row_index++; + } + } + } + + if (num_procs > 1) + { + for (i=0; i < num_cols_offd; i++) + col_map_offd[i] = offd_j[i]; + + hypre_qsort0(col_map_offd, 0, num_cols_offd-1); + + for (i=0; i < num_cols_offd; i++) + for (j=0; j < num_cols_offd; j++) + if (offd_j[i] == col_map_offd[j]) + { + offd_j[i] = j; + break; + } + } + +#ifdef HYPRE_NO_GLOBAL_PARTITION +/* ideally we would use less storage earlier in this function, but this is fine + for testing */ + { + HYPRE_Int tmp1, tmp2; + tmp1 = global_part[my_id]; + tmp2 = global_part[my_id + 1]; + hypre_TFree(global_part); + global_part = hypre_CTAlloc(HYPRE_Int, 2); + global_part[0] = tmp1; + global_part[1] = tmp2; + } +#endif + + par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part); + hypre_ParVectorOwnsPartitioning(par_rhs) = 0; + rhs = hypre_ParVectorLocalVector(par_rhs); + hypre_VectorData(rhs) = rhs_data; + + A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size, + global_part, global_part, num_cols_offd, + diag_i[local_num_rows], + offd_i[local_num_rows]); + + hypre_ParCSRMatrixColMapOffd(A) = col_map_offd; + + diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrixI(diag) = diag_i; + hypre_CSRMatrixJ(diag) = diag_j; + hypre_CSRMatrixData(diag) = diag_data; + + offd = hypre_ParCSRMatrixOffd(A); + hypre_CSRMatrixI(offd) = offd_i; + if (num_cols_offd) + { + hypre_CSRMatrixJ(offd) = offd_j; + hypre_CSRMatrixData(offd) = offd_data; + } + + hypre_TFree(nx_part); + hypre_TFree(ny_part); + hypre_TFree(nz_part); + + *rhs_ptr = (HYPRE_ParVector) par_rhs; + + return (HYPRE_ParCSRMatrix) A; +} + +HYPRE_Real afun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 1.0; + return value; +} + +HYPRE_Real bfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 1.0; + return value; +} + +HYPRE_Real cfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 1.0; + return value; +} + +HYPRE_Real dfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + if (rs_example == 1) + { + value = sin(rs_l*M_PI/8.0); + } + else if (rs_example == 2) + { + value = (2.0*yy-1.0)*(1.0-xx*xx); + } + else + { + value = 4.0*xx*(xx-1.0)*(1.0-2.0*yy); + } + return value; +} + +HYPRE_Real efun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + if (rs_example == 1) + { + value = cos(rs_l*M_PI/8.0); + } + else if (rs_example == 2) + { + value = 2.0*xx*yy*(yy-1.0); + } + else + { + value = -4.0*yy*(yy-1.0)*(1.0-2.0*xx); + } + return value; +} + +HYPRE_Real ffun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = efun_rs(xx, yy, zz); + return value; +} + +HYPRE_Real gfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 0.0; + return value; +} + +HYPRE_Real rfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 1.0; + return value; +} + +HYPRE_Real bndfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz) +{ + HYPRE_Real value; + value = 0.0; + return value; +} + diff -Nru hypre-2.11.2/src/parcsr_ls/schwarz.c hypre-2.13.0/src/parcsr_ls/schwarz.c --- hypre-2.11.2/src/parcsr_ls/schwarz.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_ls/schwarz.c 2017-10-20 17:42:22.000000000 +0000 @@ -12,562 +12,472 @@ #include "_hypre_parcsr_ls.h" #include "Common.h" - +#include "_hypre_lapack.h" /* ------------------------------------------------------------------------- dof_domain: for each dof defines neighborhood to build interpolation, - using - domain_diagmat (for cut--off scaling) and - i_domain_dof, j_dof_domain (for extracting the block of A); + using + domain_diagmat (for cut--off scaling) and + i_domain_dof, j_dof_domain (for extracting the block of A); domain_matrixinverse: contains the inverse of subdomain matrix; B can be used to define strength matrix; ----------------------------------------------------------------------- */ -#ifdef HYPRE_USING_ESSL -#include -#else -#ifdef __cplusplus -extern "C" { -#endif -HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *); - -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *n, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *); -HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *); -#ifdef __cplusplus -} -#endif -#endif - /*-------------------------------------------------------------------------- * hypre_AMGNodalSchwarzSmoother: (Not used currently) *--------------------------------------------------------------------------*/ - - HYPRE_Int hypre_AMGNodalSchwarzSmoother( hypre_CSRMatrix *A, - HYPRE_Int num_functions, - HYPRE_Int option, - hypre_CSRMatrix **domain_structure_pointer) - + HYPRE_Int num_functions, + HYPRE_Int option, + hypre_CSRMatrix **domain_structure_pointer) { + /* option = 0: nodal symGS; + 1: next to nodal symGS (overlapping Schwarz) */ - /* option = 0: nodal symGS; - 1: next to nodal symGS (overlapping Schwarz) */ - - - HYPRE_Int *i_domain_dof, *j_domain_dof; - HYPRE_Real *domain_matrixinverse; - HYPRE_Int num_domains; - hypre_CSRMatrix *domain_structure; - - HYPRE_Int *i_dof_node, *j_dof_node; - HYPRE_Int *i_node_dof, *j_node_dof; - - HYPRE_Int *i_node_dof_dof, *j_node_dof_dof; - - HYPRE_Int *i_node_node, *j_node_node; - - HYPRE_Int num_nodes; - - HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A); - HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A); - HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A); - HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A); - - - HYPRE_Int ierr = 0; - HYPRE_Int i,j,k, l_loc, i_loc, j_loc; - HYPRE_Int i_dof, j_dof; - HYPRE_Int *i_local_to_global; - HYPRE_Int *i_global_to_local; + HYPRE_Int *i_domain_dof, *j_domain_dof; + HYPRE_Real *domain_matrixinverse; + HYPRE_Int num_domains; + hypre_CSRMatrix *domain_structure; - HYPRE_Int *i_int; - HYPRE_Int *i_int_to_local; + HYPRE_Int *i_dof_node, *j_dof_node; + HYPRE_Int *i_node_dof, *j_node_dof; - HYPRE_Int int_dof_counter, local_dof_counter, max_local_dof_counter=0; + HYPRE_Int *i_node_dof_dof, *j_node_dof_dof; - HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0; + HYPRE_Int *i_node_node, *j_node_node; + HYPRE_Int num_nodes; - HYPRE_Real *AE; + HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A); + HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A); + HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A); + HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A); -#ifdef HYPRE_USING_ESSL -#else - char uplo = 'L'; -#endif - HYPRE_Int cnt; + HYPRE_Int ierr = 0; + HYPRE_Int i,j,k, l_loc, i_loc, j_loc; + HYPRE_Int i_dof, j_dof; + HYPRE_Int *i_local_to_global; + HYPRE_Int *i_global_to_local; - /* PCG arrays: --------------------------------------------------- - HYPRE_Real *x, *rhs, *v, *w, *d, *aux; + HYPRE_Int *i_int; + HYPRE_Int *i_int_to_local; - HYPRE_Int max_iter; + HYPRE_Int int_dof_counter, local_dof_counter, max_local_dof_counter=0; - ------------------------------------------------------------------ */ + HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0; + HYPRE_Real *AE; + char uplo = 'L'; + HYPRE_Int cnt; - /* build dof_node graph: ----------------------------------------- */ + /* PCG arrays: --------------------------------------------------- + HYPRE_Real *x, *rhs, *v, *w, *d, *aux; - num_nodes = num_dofs / num_functions; + HYPRE_Int max_iter; - /*hypre_printf("\nnum_nodes: %d, num_dofs: %d = %d x %d\n", num_nodes, num_dofs, - num_nodes, num_functions);*/ + ------------------------------------------------------------------ */ - i_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs+1); - j_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs); + /* build dof_node graph: ----------------------------------------- */ - for (i=0; i < num_dofs+1; i++) - i_dof_node[i] = i; + num_nodes = num_dofs / num_functions; - for (j = 0; j < num_nodes; j++) - for (k = 0; k < num_functions; k++) - j_dof_node[j*num_functions+k] = j; + /*hypre_printf("\nnum_nodes: %d, num_dofs: %d = %d x %d\n", num_nodes, num_dofs, + num_nodes, num_functions);*/ - /* build node_dof graph: ----------------------------------------- */ + i_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs+1); + j_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs); - ierr = transpose_matrix_create(&i_node_dof, &j_node_dof, - i_dof_node, j_dof_node, - - num_dofs, num_nodes); + for (i=0; i < num_dofs+1; i++) + i_dof_node[i] = i; + for (j = 0; j < num_nodes; j++) + for (k = 0; k < num_functions; k++) + j_dof_node[j*num_functions+k] = j; - /* build node_node graph: ----------------------------------------- */ + /* build node_dof graph: ----------------------------------------- */ - ierr = matrix_matrix_product(&i_node_dof_dof, - &j_node_dof_dof, + ierr = transpose_matrix_create(&i_node_dof, &j_node_dof, + i_dof_node, j_dof_node, + num_dofs, num_nodes); - i_node_dof, j_node_dof, - i_dof_dof, j_dof_dof, - - num_nodes, num_dofs, num_dofs); + /* build node_node graph: ----------------------------------------- */ - ierr = matrix_matrix_product(&i_node_node, - &j_node_node, + ierr = matrix_matrix_product(&i_node_dof_dof, + &j_node_dof_dof, + i_node_dof, j_node_dof, + i_dof_dof, j_dof_dof, + num_nodes, num_dofs, num_dofs); - i_node_dof_dof, - j_node_dof_dof, + ierr = matrix_matrix_product(&i_node_node, + &j_node_node, + i_node_dof_dof, + j_node_dof_dof, + i_dof_node, j_dof_node, + num_nodes, num_dofs, num_nodes); - i_dof_node, j_dof_node, - - num_nodes, num_dofs, num_nodes); + hypre_TFree(i_node_dof_dof); + hypre_TFree(j_node_dof_dof); - hypre_TFree(i_node_dof_dof); - hypre_TFree(j_node_dof_dof); + /* compute for each node the local information: -------------------- */ + i_global_to_local = i_dof_node; + for (i_dof =0; i_dof < num_dofs; i_dof++) + i_global_to_local[i_dof] = -1; - /* compute for each node the local information: -------------------- */ - - i_global_to_local = i_dof_node; - - for (i_dof =0; i_dof < num_dofs; i_dof++) - i_global_to_local[i_dof] = -1; - - domain_matrixinverse_counter = 0; - domain_dof_counter = 0; - for (i=0; i < num_nodes; i++) - { + domain_matrixinverse_counter = 0; + domain_dof_counter = 0; + for (i=0; i < num_nodes; i++) + { local_dof_counter = 0; for (j=i_node_node[i]; j < i_node_node[i+1]; j++) - for (k=i_node_dof[j_node_node[j]]; - k max_local_dof_counter) - max_local_dof_counter = local_dof_counter; + max_local_dof_counter = local_dof_counter; for (j=i_node_node[i]; j < i_node_node[i+1]; j++) - for (k=i_node_dof[j_node_node[j]]; - k=0) - AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j]; - } - } - - - /* get block for Schwarz smoother: ============================= */ - /* ierr = hypre_matinv(XE, AE, local_dof_counter); */ - /* hypre_printf("ierr_AE_inv: %d\n", ierr); */ -#ifdef HYPRE_USING_ESSL - cnt = local_dof_counter; - for (j_loc=1; j_loc < local_dof_counter; j_loc++) - for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++) - AE[cnt++] = AE[i_loc + j_loc * local_dof_counter]; - ierr = dppf(AE, local_dof_counter, 1); - if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n"); -#else - hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo, &local_dof_counter, AE, - &local_dof_counter, &ierr); - if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n"); -#endif - - for (i_loc=0; i_loc < local_dof_counter; i_loc++) - j_domain_dof[domain_dof_counter+i_loc] - = i_local_to_global[i_loc]; - } + { + AE = &domain_matrixinverse[domain_matrixinverse_counter]; + cnt = 0; + for (i_loc=0; i_loc < local_dof_counter; i_loc++) + for (j_loc=0; j_loc < local_dof_counter; j_loc++) + AE[cnt++] = 0.e0; + + for (i_loc=0; i_loc < local_dof_counter; i_loc++) + { + i_dof = i_local_to_global[i_loc]; + for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) + { + j_loc = i_global_to_local[j_dof_dof[j]]; + if (j_loc >=0) + AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j]; + } + } + /* get block for Schwarz smoother: ============================= */ + /* ierr = hypre_matinv(XE, AE, local_dof_counter); */ + /* hypre_printf("ierr_AE_inv: %d\n", ierr); */ + hypre_dpotrf(&uplo, &local_dof_counter, AE, + &local_dof_counter, &ierr); + if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n"); + + for (i_loc=0; i_loc < local_dof_counter; i_loc++) + j_domain_dof[domain_dof_counter+i_loc] + = i_local_to_global[i_loc]; + } if (option == 0) - { + { + AE = &domain_matrixinverse[domain_matrixinverse_counter]; + for (i_loc=0; i_loc < int_dof_counter; i_loc++) + for (j_loc=0; j_loc < int_dof_counter; j_loc++) + AE[i_loc + j_loc * int_dof_counter] = 0.e0; - AE = &domain_matrixinverse[domain_matrixinverse_counter]; - for (i_loc=0; i_loc < int_dof_counter; i_loc++) - for (j_loc=0; j_loc < int_dof_counter; j_loc++) - AE[i_loc + j_loc * int_dof_counter] = 0.e0; - - - - for (l_loc=0; l_loc < int_dof_counter; l_loc++) - { - i_loc = i_int_to_local[l_loc]; - i_dof = i_local_to_global[i_loc]; - for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) - { - j_loc = i_global_to_local[j_dof_dof[j]]; - if (j_loc >=0) - if (i_int[j_loc] >=0) - AE[i_loc + i_int[j_loc] * int_dof_counter] - = a_dof_dof[j]; - } - } - - /* ierr = hypre_matinv(XE, AE, int_dof_counter); */ -#ifdef HYPRE_USING_ESSL - cnt = local_dof_counter; - for (j_loc=1; j_loc < local_dof_counter; j_loc++) - for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++) - AE[cnt++] = AE[i_loc + j_loc * local_dof_counter]; - dppf(AE, local_dof_counter, 1); -#else - hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo, &local_dof_counter, AE, - &local_dof_counter, &ierr); -/* dpotrf_(&uplo, &local_dof_counter, AE, &local_dof_counter, &ierr);*/ -#endif - - if (ierr) hypre_error_w_msg(HYPRE_ERROR_GENERIC," error in dpotrf !!!\n"); - - for (i_loc=0; i_loc < int_dof_counter; i_loc++) - { - j_domain_dof[domain_dof_counter + i_loc] = - i_local_to_global[i_int_to_local[i_loc]]; - - for (j_loc=0; j_loc < int_dof_counter; j_loc++) - domain_matrixinverse[domain_matrixinverse_counter - + i_loc + j_loc * int_dof_counter] - = AE[i_loc + j_loc * int_dof_counter]; - } - - domain_dof_counter+=int_dof_counter; - domain_matrixinverse_counter+=int_dof_counter*int_dof_counter; - } - else - { - domain_dof_counter+=local_dof_counter; -#ifdef HYPRE_USING_ESSL - domain_matrixinverse_counter+=local_dof_counter*(local_dof_counter+1)/2; -#else - domain_matrixinverse_counter+=local_dof_counter*local_dof_counter; -#endif - } + for (l_loc=0; l_loc < int_dof_counter; l_loc++) + { + i_loc = i_int_to_local[l_loc]; + i_dof = i_local_to_global[i_loc]; + for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) + { + j_loc = i_global_to_local[j_dof_dof[j]]; + if (j_loc >=0) + if (i_int[j_loc] >=0) + AE[i_loc + i_int[j_loc] * int_dof_counter] + = a_dof_dof[j]; + } + } + /* ierr = hypre_matinv(XE, AE, int_dof_counter); */ + hypre_dpotrf(&uplo, &local_dof_counter, AE, + &local_dof_counter, &ierr); - for (l_loc=0; l_loc < local_dof_counter; l_loc++) - { - i_int[l_loc] = -1; - i_global_to_local[i_local_to_global[l_loc]] = -1; - } + if (ierr) hypre_error_w_msg(HYPRE_ERROR_GENERIC," error in dpotrf !!!\n"); + for (i_loc=0; i_loc < int_dof_counter; i_loc++) + { + j_domain_dof[domain_dof_counter + i_loc] = + i_local_to_global[i_int_to_local[i_loc]]; - } + for (j_loc=0; j_loc < int_dof_counter; j_loc++) + domain_matrixinverse[domain_matrixinverse_counter + + i_loc + j_loc * int_dof_counter] + = AE[i_loc + j_loc * int_dof_counter]; + } - i_domain_dof[num_nodes] = domain_dof_counter; + domain_dof_counter+=int_dof_counter; + domain_matrixinverse_counter+=int_dof_counter*int_dof_counter; + } + else + { + domain_dof_counter+=local_dof_counter; + domain_matrixinverse_counter+=local_dof_counter*local_dof_counter; + } + for (l_loc=0; l_loc < local_dof_counter; l_loc++) + { + i_int[l_loc] = -1; + i_global_to_local[i_local_to_global[l_loc]] = -1; + } + } - hypre_TFree(i_dof_node); - hypre_TFree(j_dof_node); + i_domain_dof[num_nodes] = domain_dof_counter; - hypre_TFree(i_node_dof); - hypre_TFree(j_node_dof); - hypre_TFree(i_node_node); - hypre_TFree(j_node_node); + hypre_TFree(i_dof_node); + hypre_TFree(j_dof_node); - hypre_TFree(i_int); - hypre_TFree(i_int_to_local); + hypre_TFree(i_node_dof); + hypre_TFree(j_node_dof); + hypre_TFree(i_node_node); + hypre_TFree(j_node_node); - hypre_TFree(i_local_to_global); + hypre_TFree(i_int); + hypre_TFree(i_int_to_local); - domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, - i_domain_dof[num_domains]); - hypre_CSRMatrixI(domain_structure) = i_domain_dof; - hypre_CSRMatrixJ(domain_structure) = j_domain_dof; - hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; + hypre_TFree(i_local_to_global); - *domain_structure_pointer = domain_structure; + domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, + i_domain_dof[num_domains]); + hypre_CSRMatrixI(domain_structure) = i_domain_dof; + hypre_CSRMatrixJ(domain_structure) = j_domain_dof; + hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; + *domain_structure_pointer = domain_structure; return hypre_error_flag; - } HYPRE_Int hypre_ParMPSchwarzSolve(hypre_ParCSRMatrix *par_A, - hypre_CSRMatrix *A_boundary, - hypre_ParVector *rhs_vector, - hypre_CSRMatrix *domain_structure, - hypre_ParVector *par_x, - HYPRE_Real relax_wt, - HYPRE_Real *scale, - hypre_ParVector *Vtemp, HYPRE_Int *pivots, - HYPRE_Int use_nonsymm) - + hypre_CSRMatrix *A_boundary, + hypre_ParVector *rhs_vector, + hypre_CSRMatrix *domain_structure, + hypre_ParVector *par_x, + HYPRE_Real relax_wt, + HYPRE_Real *scale, + hypre_ParVector *Vtemp, HYPRE_Int *pivots, + HYPRE_Int use_nonsymm) { - hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(par_A); - HYPRE_Int num_sends = 0; - HYPRE_Int *send_map_starts; - HYPRE_Int *send_map_elmts; - - hypre_ParCSRCommHandle *comm_handle; - - HYPRE_Int ierr = 0; - /* HYPRE_Int num_dofs; */ - hypre_CSRMatrix *A_diag; - HYPRE_Int *A_diag_i; - HYPRE_Int *A_diag_j; - HYPRE_Real *A_diag_data; - hypre_CSRMatrix *A_offd; - HYPRE_Int *A_offd_i; - HYPRE_Int *A_offd_j; - HYPRE_Real *A_offd_data; - HYPRE_Real *x; - HYPRE_Real *x_ext; - HYPRE_Real *x_ext_old; - HYPRE_Real *rhs; - HYPRE_Real *rhs_ext; - HYPRE_Real *vtemp_data; - HYPRE_Real *aux; - HYPRE_Real *buf_data; + hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(par_A); + HYPRE_Int num_sends = 0; + HYPRE_Int *send_map_starts; + HYPRE_Int *send_map_elmts; + + hypre_ParCSRCommHandle *comm_handle; + + HYPRE_Int ierr = 0; + /* HYPRE_Int num_dofs; */ + hypre_CSRMatrix *A_diag; + HYPRE_Int *A_diag_i; + HYPRE_Int *A_diag_j; + HYPRE_Real *A_diag_data; + hypre_CSRMatrix *A_offd; + HYPRE_Int *A_offd_i; + HYPRE_Int *A_offd_j; + HYPRE_Real *A_offd_data; + HYPRE_Real *x; + HYPRE_Real *x_ext; + HYPRE_Real *x_ext_old; + HYPRE_Real *rhs; + HYPRE_Real *rhs_ext; + HYPRE_Real *vtemp_data; + HYPRE_Real *aux; + HYPRE_Real *buf_data; /*hypre_Vector *x_vector;*/ - MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); - HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); - HYPRE_Int max_domain_size = hypre_CSRMatrixNumCols(domain_structure); - HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); - HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); - HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); - HYPRE_Int *A_boundary_i; - HYPRE_Int *A_boundary_j; - HYPRE_Real *A_boundary_data; - HYPRE_Int num_variables; - HYPRE_Int num_cols_offd; - - - HYPRE_Int piv_counter = 0; - HYPRE_Int one = 1; - char uplo = 'L'; - - HYPRE_Int jj,i,j,k, j_loc, k_loc; - HYPRE_Int index; - - HYPRE_Int matrix_size, matrix_size_counter = 0; - - HYPRE_Int num_procs; - - hypre_MPI_Comm_size(comm,&num_procs); - - /* initiate: ----------------------------------------------- */ - /* num_dofs = hypre_CSRMatrixNumRows(A); */ - - A_diag = hypre_ParCSRMatrixDiag(par_A); - A_offd = hypre_ParCSRMatrixOffd(par_A); - num_variables = hypre_CSRMatrixNumRows(A_diag); - num_cols_offd = hypre_CSRMatrixNumCols(A_offd); - x = hypre_VectorData(hypre_ParVectorLocalVector(par_x)); - vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp)); - rhs = hypre_VectorData(hypre_ParVectorLocalVector(rhs_vector)); + MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); + HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); + HYPRE_Int max_domain_size = hypre_CSRMatrixNumCols(domain_structure); + HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); + HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); + HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); + HYPRE_Int *A_boundary_i; + HYPRE_Int *A_boundary_j; + HYPRE_Real *A_boundary_data; + HYPRE_Int num_variables; + HYPRE_Int num_cols_offd; + + HYPRE_Int piv_counter = 0; + HYPRE_Int one = 1; + char uplo = 'L'; + + HYPRE_Int jj,i,j,k, j_loc, k_loc; + HYPRE_Int index; + + HYPRE_Int matrix_size, matrix_size_counter = 0; + + HYPRE_Int num_procs; + + hypre_MPI_Comm_size(comm,&num_procs); - if (use_nonsymm) - uplo = 'N'; + /* initiate: ----------------------------------------------- */ + /* num_dofs = hypre_CSRMatrixNumRows(A); */ + + A_diag = hypre_ParCSRMatrixDiag(par_A); + A_offd = hypre_ParCSRMatrixOffd(par_A); + num_variables = hypre_CSRMatrixNumRows(A_diag); + num_cols_offd = hypre_CSRMatrixNumCols(A_offd); + x = hypre_VectorData(hypre_ParVectorLocalVector(par_x)); + vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp)); + rhs = hypre_VectorData(hypre_ParVectorLocalVector(rhs_vector)); + + if (use_nonsymm) + uplo = 'N'; /*x_vector = hypre_ParVectorLocalVector(par_x);*/ - A_diag_i = hypre_CSRMatrixI(A_diag); - A_diag_j = hypre_CSRMatrixJ(A_diag); - A_diag_data = hypre_CSRMatrixData(A_diag); - A_offd_i = hypre_CSRMatrixI(A_offd); - if (num_cols_offd) - { - A_offd_j = hypre_CSRMatrixJ(A_offd); - A_offd_data = hypre_CSRMatrixData(A_offd); - A_boundary_i = hypre_CSRMatrixI(A_boundary); - A_boundary_j = hypre_CSRMatrixJ(A_boundary); - A_boundary_data = hypre_CSRMatrixData(A_boundary); - } - aux = hypre_CTAlloc(HYPRE_Real, max_domain_size); + A_diag_i = hypre_CSRMatrixI(A_diag); + A_diag_j = hypre_CSRMatrixJ(A_diag); + A_diag_data = hypre_CSRMatrixData(A_diag); + A_offd_i = hypre_CSRMatrixI(A_offd); + if (num_cols_offd) + { + A_offd_j = hypre_CSRMatrixJ(A_offd); + A_offd_data = hypre_CSRMatrixData(A_offd); + A_boundary_i = hypre_CSRMatrixI(A_boundary); + A_boundary_j = hypre_CSRMatrixJ(A_boundary); + A_boundary_data = hypre_CSRMatrixData(A_boundary); + } + aux = hypre_CTAlloc(HYPRE_Real, max_domain_size); - hypre_ParVectorCopy(rhs_vector,Vtemp); - hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp); + hypre_ParVectorCopy(rhs_vector,Vtemp); + hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp); - if (comm_pkg) - { - num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); - send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg); - send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); - - buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]); - x_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd); - x_ext_old = hypre_CTAlloc(HYPRE_Real, num_cols_offd); - rhs_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd); + if (comm_pkg) + { + num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); + send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg); + send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); - index = 0; - for (i=0; i < num_sends; i++) - { - for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++) - buf_data[index++] = vtemp_data[send_map_elmts[j]]; - } + buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]); + x_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd); + x_ext_old = hypre_CTAlloc(HYPRE_Real, num_cols_offd); + rhs_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd); - comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data, - rhs_ext); - hypre_ParCSRCommHandleDestroy(comm_handle); - comm_handle = NULL; + index = 0; + for (i=0; i < num_sends; i++) + { + for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++) + buf_data[index++] = vtemp_data[send_map_elmts[j]]; + } - index = 0; - for (i=0; i < num_sends; i++) - { - for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++) - buf_data[index++] = x[send_map_elmts[j]]; - } + comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data, + rhs_ext); + hypre_ParCSRCommHandleDestroy(comm_handle); + comm_handle = NULL; - comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext); - hypre_ParCSRCommHandleDestroy(comm_handle); - comm_handle = NULL; - } + index = 0; + for (i=0; i < num_sends; i++) + { + for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++) + buf_data[index++] = x[send_map_elmts[j]]; + } + + comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext); + hypre_ParCSRCommHandleDestroy(comm_handle); + comm_handle = NULL; + } - /* correction of residual for exterior points to be updated locally */ + /* correction of residual for exterior points to be updated locally */ for (i=0; i < num_cols_offd; i++) { x_ext_old[i] = x_ext[i]; for (j = A_boundary_i[i]; j < A_boundary_i[i+1]; j++) { k_loc = A_boundary_j[j]; - if (k_loc < num_variables) - rhs_ext[i] += A_boundary_data[j]*x[k_loc]; - else - rhs_ext[i] += A_boundary_data[j]*x_ext[k_loc-num_variables]; + if (k_loc < num_variables) + rhs_ext[i] += A_boundary_data[j]*x[k_loc]; + else + rhs_ext[i] += A_boundary_data[j]*x_ext[k_loc-num_variables]; } } /* forward solve: ----------------------------------------------- */ @@ -581,227 +491,187 @@ jj = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - j_loc = j_domain_dof[j]; - if (j_loc < num_variables) - { - aux[jj] = rhs[j_loc]; - for (k=A_diag_i[j_loc]; k -1; i--) { matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; -/* OLD - HYPRE_USING_ESSL - matrix_size_counter -= matrix_size * (matrix_size+1)/2; -*/ matrix_size_counter -= matrix_size * matrix_size; piv_counter -= matrix_size; - + /* compute residual: ---------------------------------------- */ jj = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - j_loc = j_domain_dof[j]; - if (j_loc < num_variables) - { - aux[jj] = rhs[j_loc]; - for (k=A_diag_i[j_loc]; k 1) - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - else - rhs = hypre_VectorData(rhs_vector); + HYPRE_Int piv_counter = 0; + HYPRE_Int one = 1; + char uplo = 'L'; - /* forward solve: ----------------------------------------------- */ + HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - matrix_size_counter = 0; - for (i=0; i < num_domains; i++) - { + + HYPRE_Int matrix_size, matrix_size_counter = 0; + + HYPRE_Int num_procs; + + hypre_MPI_Comm_size(comm,&num_procs); + + /* initiate: ----------------------------------------------- */ + /* num_dofs = hypre_CSRMatrixNumRows(A); */ + x_vector = hypre_ParVectorLocalVector(par_x); + A = hypre_ParCSRMatrixDiag(par_A); + i_dof_dof = hypre_CSRMatrixI(A); + j_dof_dof = hypre_CSRMatrixJ(A); + a_dof_dof = hypre_CSRMatrixData(A); + x = hypre_VectorData(x_vector); + aux = hypre_VectorData(aux_vector); + /* for (i=0; i < num_dofs; i++) + x[i] = 0.e0; */ + + if (use_nonsymm) + uplo = 'N'; + + + if (num_procs > 1) + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + else + rhs = hypre_VectorData(rhs_vector); + + /* forward solve: ----------------------------------------------- */ + + matrix_size_counter = 0; + for (i=0; i < num_domains; i++) + { matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; /* compute residual: ---------------------------------------- */ jj = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - aux[jj] = rhs[j_domain_dof[j]]; - for (k=i_dof_dof[j_domain_dof[j]]; - k 1) - { - hypre_TFree(rhs); - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - } - else - rhs = hypre_VectorData(rhs_vector); -*/ - /* backward solve: ------------------------------------------------ */ - for (i=num_domains-1; i > -1; i--) { - matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; -/* OLD - HYPRE_USING_ESSL - matrix_size_counter -= matrix_size * (matrix_size+1)/2; + hypre_TFree(rhs); + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + } + else + rhs = hypre_VectorData(rhs_vector); */ + /* backward solve: ------------------------------------------------ */ + for (i=num_domains-1; i > -1; i--) + { + matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; matrix_size_counter -= matrix_size * matrix_size; piv_counter -= matrix_size; - /* compute residual: ---------------------------------------- */ + jj = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - aux[jj] = rhs[j_domain_dof[j]]; - for (k=i_dof_dof[j_domain_dof[j]]; - k 1) hypre_TFree(rhs); + if (num_procs > 1) hypre_TFree(rhs); - return hypre_error_flag; + return hypre_error_flag; } HYPRE_Int hypre_MPSchwarzCFSolve(hypre_ParCSRMatrix *par_A, - hypre_Vector *rhs_vector, - hypre_CSRMatrix *domain_structure, - hypre_ParVector *par_x, - HYPRE_Real relax_wt, - hypre_Vector *aux_vector, - HYPRE_Int *CF_marker, - HYPRE_Int rlx_pt, HYPRE_Int *pivots, - HYPRE_Int use_nonsymm) + hypre_Vector *rhs_vector, + hypre_CSRMatrix *domain_structure, + hypre_ParVector *par_x, + HYPRE_Real relax_wt, + hypre_Vector *aux_vector, + HYPRE_Int *CF_marker, + HYPRE_Int rlx_pt, HYPRE_Int *pivots, + HYPRE_Int use_nonsymm) +{ + HYPRE_Int ierr = 0; + /* HYPRE_Int num_dofs; */ + HYPRE_Int *i_dof_dof; + HYPRE_Int *j_dof_dof; + HYPRE_Real *a_dof_dof; + HYPRE_Real *x; + HYPRE_Real *rhs; + HYPRE_Real *aux; + hypre_CSRMatrix *A; + hypre_Vector *x_vector; + MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); + HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); + HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); + HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); + HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); -{ - HYPRE_Int ierr = 0; - /* HYPRE_Int num_dofs; */ - HYPRE_Int *i_dof_dof; - HYPRE_Int *j_dof_dof; - HYPRE_Real *a_dof_dof; - HYPRE_Real *x; - HYPRE_Real *rhs; - HYPRE_Real *aux; - hypre_CSRMatrix *A; - hypre_Vector *x_vector; - MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); - HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); - HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); - HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); - HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); - - - HYPRE_Int piv_counter = 0; - HYPRE_Int one = 1; - char uplo = 'L'; - HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - - - HYPRE_Int matrix_size, matrix_size_counter = 0; - - HYPRE_Int num_procs; - - hypre_MPI_Comm_size(comm,&num_procs); - - /* initiate: ----------------------------------------------- */ - /* num_dofs = hypre_CSRMatrixNumRows(A); */ - x_vector = hypre_ParVectorLocalVector(par_x); - A = hypre_ParCSRMatrixDiag(par_A); - i_dof_dof = hypre_CSRMatrixI(A); - j_dof_dof = hypre_CSRMatrixJ(A); - a_dof_dof = hypre_CSRMatrixData(A); - x = hypre_VectorData(x_vector); - aux = hypre_VectorData(aux_vector); - /* for (i=0; i < num_dofs; i++) - x[i] = 0.e0; */ - - if (use_nonsymm) - uplo = 'N'; - - if (num_procs > 1) - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - else - rhs = hypre_VectorData(rhs_vector); + HYPRE_Int piv_counter = 0; + HYPRE_Int one = 1; + char uplo = 'L'; + HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ + + HYPRE_Int matrix_size, matrix_size_counter = 0; + + HYPRE_Int num_procs; + + hypre_MPI_Comm_size(comm,&num_procs); + + /* initiate: ----------------------------------------------- */ + /* num_dofs = hypre_CSRMatrixNumRows(A); */ + x_vector = hypre_ParVectorLocalVector(par_x); + A = hypre_ParCSRMatrixDiag(par_A); + i_dof_dof = hypre_CSRMatrixI(A); + j_dof_dof = hypre_CSRMatrixJ(A); + a_dof_dof = hypre_CSRMatrixData(A); + x = hypre_VectorData(x_vector); + aux = hypre_VectorData(aux_vector); + /* for (i=0; i < num_dofs; i++) + x[i] = 0.e0; */ - /* forward solve: ----------------------------------------------- */ + if (use_nonsymm) + uplo = 'N'; - matrix_size_counter = 0; - for (i=0; i < num_domains; i++) - { - if (CF_marker[i] == rlx_pt) - { - matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; + if (num_procs > 1) + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + else + rhs = hypre_VectorData(rhs_vector); - /* compute residual: ---------------------------------------- */ + /* forward solve: ----------------------------------------------- */ - jj = 0; - for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - aux[jj] = rhs[j_domain_dof[j]]; - if (CF_marker[j_domain_dof[j]] == rlx_pt) - { - for (k=i_dof_dof[j_domain_dof[j]]; - k 1) - { - hypre_TFree(rhs); - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - } - else - rhs = hypre_VectorData(rhs_vector); -*/ - /* backward solve: ------------------------------------------------ */ - for (i=num_domains-1; i > -1; i--) { + hypre_TFree(rhs); + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + } + else + rhs = hypre_VectorData(rhs_vector); +*/ + /* backward solve: ------------------------------------------------ */ + for (i=num_domains-1; i > -1; i--) + { if (CF_marker[i] == rlx_pt) { - matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; -/* OLD - HYPRE_USING_ESSL - matrix_size_counter -= matrix_size * (matrix_size+1)/2; -*/ - matrix_size_counter -= matrix_size * matrix_size; - piv_counter -= matrix_size; + matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; + matrix_size_counter -= matrix_size * matrix_size; + piv_counter -= matrix_size; + + /* compute residual: ---------------------------------------- */ + jj = 0; + for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) + { + aux[jj] = rhs[j_domain_dof[j]]; + if (CF_marker[j_domain_dof[j]] == rlx_pt) + { + for (k=i_dof_dof[j_domain_dof[j]]; + k 1) hypre_TFree(rhs); + if (num_procs > 1) hypre_TFree(rhs); - return hypre_error_flag; + return hypre_error_flag; } HYPRE_Int hypre_MPSchwarzFWSolve(hypre_ParCSRMatrix *par_A, - hypre_Vector *rhs_vector, - hypre_CSRMatrix *domain_structure, - hypre_ParVector *par_x, - HYPRE_Real relax_wt, - hypre_Vector *aux_vector, HYPRE_Int *pivots, - HYPRE_Int use_nonsymm) + hypre_Vector *rhs_vector, + hypre_CSRMatrix *domain_structure, + hypre_ParVector *par_x, + HYPRE_Real relax_wt, + hypre_Vector *aux_vector, HYPRE_Int *pivots, + HYPRE_Int use_nonsymm) +{ + HYPRE_Int ierr = 0; + /* HYPRE_Int num_dofs; */ + HYPRE_Int *i_dof_dof; + HYPRE_Int *j_dof_dof; + HYPRE_Real *a_dof_dof; + HYPRE_Real *x; + HYPRE_Real *rhs; + HYPRE_Real *aux; + hypre_CSRMatrix *A; + hypre_Vector *x_vector; + MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); + HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); + HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); + HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); + HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); -{ - HYPRE_Int ierr = 0; - /* HYPRE_Int num_dofs; */ - HYPRE_Int *i_dof_dof; - HYPRE_Int *j_dof_dof; - HYPRE_Real *a_dof_dof; - HYPRE_Real *x; - HYPRE_Real *rhs; - HYPRE_Real *aux; - hypre_CSRMatrix *A; - hypre_Vector *x_vector; - MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); - HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); - HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); - HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); - HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); - - - HYPRE_Int piv_counter = 0; - HYPRE_Int one = 1; - char uplo = 'L'; - HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - - - HYPRE_Int matrix_size, matrix_size_counter = 0; - - HYPRE_Int num_procs; - - hypre_MPI_Comm_size(comm,&num_procs); - - /* initiate: ----------------------------------------------- */ - /* num_dofs = hypre_CSRMatrixNumRows(A); */ - x_vector = hypre_ParVectorLocalVector(par_x); - A = hypre_ParCSRMatrixDiag(par_A); - i_dof_dof = hypre_CSRMatrixI(A); - j_dof_dof = hypre_CSRMatrixJ(A); - a_dof_dof = hypre_CSRMatrixData(A); - x = hypre_VectorData(x_vector); - aux = hypre_VectorData(aux_vector); - /* for (i=0; i < num_dofs; i++) - x[i] = 0.e0; */ - - if (num_procs > 1) - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - else - rhs = hypre_VectorData(rhs_vector); - /* forward solve: ----------------------------------------------- */ + HYPRE_Int piv_counter = 0; + HYPRE_Int one = 1; + char uplo = 'L'; + HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - matrix_size_counter = 0; - for (i=0; i < num_domains; i++) - { + + HYPRE_Int matrix_size, matrix_size_counter = 0; + + HYPRE_Int num_procs; + + hypre_MPI_Comm_size(comm,&num_procs); + + /* initiate: ----------------------------------------------- */ + /* num_dofs = hypre_CSRMatrixNumRows(A); */ + x_vector = hypre_ParVectorLocalVector(par_x); + A = hypre_ParCSRMatrixDiag(par_A); + i_dof_dof = hypre_CSRMatrixI(A); + j_dof_dof = hypre_CSRMatrixJ(A); + a_dof_dof = hypre_CSRMatrixData(A); + x = hypre_VectorData(x_vector); + aux = hypre_VectorData(aux_vector); + /* for (i=0; i < num_dofs; i++) + x[i] = 0.e0; */ + + if (num_procs > 1) + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + else + rhs = hypre_VectorData(rhs_vector); + + /* forward solve: ----------------------------------------------- */ + + matrix_size_counter = 0; + for (i=0; i < num_domains; i++) + { matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; /* compute residual: ---------------------------------------- */ jj = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - aux[jj] = rhs[j_domain_dof[j]]; - for (k=i_dof_dof[j_domain_dof[j]]; - k 1) hypre_TFree(rhs); - - return hypre_error_flag; + if (num_procs > 1) hypre_TFree(rhs); + return hypre_error_flag; } HYPRE_Int hypre_MPSchwarzCFFWSolve(hypre_ParCSRMatrix *par_A, - hypre_Vector *rhs_vector, - hypre_CSRMatrix *domain_structure, - hypre_ParVector *par_x, - HYPRE_Real relax_wt, - hypre_Vector *aux_vector, - HYPRE_Int *CF_marker, - HYPRE_Int rlx_pt, HYPRE_Int *pivots, - HYPRE_Int use_nonsymm) + hypre_Vector *rhs_vector, + hypre_CSRMatrix *domain_structure, + hypre_ParVector *par_x, + HYPRE_Real relax_wt, + hypre_Vector *aux_vector, + HYPRE_Int *CF_marker, + HYPRE_Int rlx_pt, HYPRE_Int *pivots, + HYPRE_Int use_nonsymm) +{ + HYPRE_Int ierr = 0; + /* HYPRE_Int num_dofs; */ + HYPRE_Int *i_dof_dof; + HYPRE_Int *j_dof_dof; + HYPRE_Real *a_dof_dof; + HYPRE_Real *x; + HYPRE_Real *rhs; + HYPRE_Real *aux; + hypre_CSRMatrix *A; + hypre_Vector *x_vector; + MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); + HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); + HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); + HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); + HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); -{ - HYPRE_Int ierr = 0; - /* HYPRE_Int num_dofs; */ - HYPRE_Int *i_dof_dof; - HYPRE_Int *j_dof_dof; - HYPRE_Real *a_dof_dof; - HYPRE_Real *x; - HYPRE_Real *rhs; - HYPRE_Real *aux; - hypre_CSRMatrix *A; - hypre_Vector *x_vector; - MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); - HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); - HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); - HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); - HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure); - - HYPRE_Int piv_counter = 0; - HYPRE_Int one = 1; - - char uplo = 'L'; - HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - - - HYPRE_Int matrix_size, matrix_size_counter = 0; - - HYPRE_Int num_procs; - - hypre_MPI_Comm_size(comm,&num_procs); - - /* initiate: ----------------------------------------------- */ - /* num_dofs = hypre_CSRMatrixNumRows(A); */ - x_vector = hypre_ParVectorLocalVector(par_x); - A = hypre_ParCSRMatrixDiag(par_A); - i_dof_dof = hypre_CSRMatrixI(A); - j_dof_dof = hypre_CSRMatrixJ(A); - a_dof_dof = hypre_CSRMatrixData(A); - x = hypre_VectorData(x_vector); - aux = hypre_VectorData(aux_vector); - /* for (i=0; i < num_dofs; i++) - x[i] = 0.e0; */ - - if (use_nonsymm) - uplo = 'N'; - - if (num_procs > 1) - hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); - else - rhs = hypre_VectorData(rhs_vector); + HYPRE_Int piv_counter = 0; + HYPRE_Int one = 1; - /* forward solve: ----------------------------------------------- */ + char uplo = 'L'; + HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/ - matrix_size_counter = 0; - for (i=0; i < num_domains; i++) - { - if (CF_marker[i] == rlx_pt) - { - matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; - /* compute residual: ---------------------------------------- */ + HYPRE_Int matrix_size, matrix_size_counter = 0; - jj = 0; - for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - aux[jj] = rhs[j_domain_dof[j]]; - if (CF_marker[j_domain_dof[j]] == rlx_pt) - { - for (k=i_dof_dof[j_domain_dof[j]]; - k 1) + hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs); + else + rhs = hypre_VectorData(rhs_vector); + + /* forward solve: ----------------------------------------------- */ + + matrix_size_counter = 0; + for (i=0; i < num_domains; i++) + { + if (CF_marker[i] == rlx_pt) { - -#ifdef HYPRE_USING_ESSL - dpotrs(&uplo, matrix_size, one, - &domain_matrixinverse[matrix_size_counter], - matrix_size, aux, - matrix_size, &ierr); -#else - hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, - &domain_matrixinverse[matrix_size_counter], - &matrix_size, aux, - &matrix_size, &ierr); -#endif - } - - if (ierr) hypre_error(HYPRE_ERROR_GENERIC); - jj = 0; - for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - x[j_domain_dof[j]]+= relax_wt * aux[jj++]; - } -/* OLD - HYPRE_USING_ESSL - matrix_size_counter += matrix_size * (matrix_size+1)/2; -*/ - matrix_size_counter += matrix_size * matrix_size; - piv_counter += matrix_size; + matrix_size = i_domain_dof[i+1] - i_domain_dof[i]; + /* compute residual: ---------------------------------------- */ - } - } + jj = 0; + for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) + { + aux[jj] = rhs[j_domain_dof[j]]; + if (CF_marker[j_domain_dof[j]] == rlx_pt) + { + for (k=i_dof_dof[j_domain_dof[j]]; + k 1) hypre_TFree(rhs); + else + { + hypre_dpotrs(&uplo, &matrix_size, &one, + &domain_matrixinverse[matrix_size_counter], + &matrix_size, aux, + &matrix_size, &ierr); + } + + if (ierr) hypre_error(HYPRE_ERROR_GENERIC); + jj = 0; + for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) + { + x[j_domain_dof[j]]+= relax_wt * aux[jj++]; + } + matrix_size_counter += matrix_size * matrix_size; + piv_counter += matrix_size; + } + } - return hypre_error_flag; + if (num_procs > 1) hypre_TFree(rhs); + return hypre_error_flag; } -HYPRE_Int +HYPRE_Int transpose_matrix_create( HYPRE_Int **i_face_element_pointer, - HYPRE_Int **j_face_element_pointer, - - HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, - - HYPRE_Int num_elements, HYPRE_Int num_faces) - + HYPRE_Int **j_face_element_pointer, + HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, + HYPRE_Int num_elements, HYPRE_Int num_faces) { - /* FILE *f; */ - HYPRE_Int i, j; + /* FILE *f; */ + HYPRE_Int i, j; - HYPRE_Int *i_face_element, *j_face_element; + HYPRE_Int *i_face_element, *j_face_element; - /* ====================================================================== - first create face_element graph: ------------------------------------- - ====================================================================== */ + /* ====================================================================== + first create face_element graph: ------------------------------------- + ====================================================================== */ - i_face_element = (HYPRE_Int *) malloc((num_faces+1) * sizeof(HYPRE_Int)); - j_face_element = (HYPRE_Int *) malloc(i_element_face[num_elements] * sizeof(HYPRE_Int)); + i_face_element = (HYPRE_Int *) malloc((num_faces+1) * sizeof(HYPRE_Int)); + j_face_element = (HYPRE_Int *) malloc(i_element_face[num_elements] * sizeof(HYPRE_Int)); - for (i=0; i < num_faces; i++) - i_face_element[i] = 0; + for (i=0; i < num_faces; i++) + i_face_element[i] = 0; - for (i=0; i < num_elements; i++) - for (j=i_element_face[i]; j < i_element_face[i+1]; j++) - i_face_element[j_element_face[j]]++; + for (i=0; i < num_elements; i++) + for (j=i_element_face[i]; j < i_element_face[i+1]; j++) + i_face_element[j_element_face[j]]++; - i_face_element[num_faces] = i_element_face[num_elements]; + i_face_element[num_faces] = i_element_face[num_elements]; - for (i=num_faces-1; i > -1; i--) - i_face_element[i] = i_face_element[i+1] - i_face_element[i]; + for (i=num_faces-1; i > -1; i--) + i_face_element[i] = i_face_element[i+1] - i_face_element[i]; - for (i=0; i < num_elements; i++) - for (j=i_element_face[i]; j < i_element_face[i+1]; j++) + for (i=0; i < num_elements; i++) + for (j=i_element_face[i]; j < i_element_face[i+1]; j++) { - j_face_element[i_face_element[j_element_face[j]]] = i; - i_face_element[j_element_face[j]]++; + j_face_element[i_face_element[j_element_face[j]]] = i; + i_face_element[j_element_face[j]]++; } - for (i=num_faces-1; i > -1; i--) - i_face_element[i+1] = i_face_element[i]; + for (i=num_faces-1; i > -1; i--) + i_face_element[i+1] = i_face_element[i]; - i_face_element[0] = 0; + i_face_element[0] = 0; - /* hypre_printf("end building face--element graph: ++++++++++++++++++\n"); */ + /* hypre_printf("end building face--element graph: ++++++++++++++++++\n"); */ - /* END building face_element graph; ================================ */ + /* END building face_element graph; ================================ */ - *i_face_element_pointer = i_face_element; - *j_face_element_pointer = j_face_element; + *i_face_element_pointer = i_face_element; + *j_face_element_pointer = j_face_element; - return 0; + return 0; } -HYPRE_Int -matrix_matrix_product( HYPRE_Int **i_element_edge_pointer, - HYPRE_Int **j_element_edge_pointer, - - HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, - HYPRE_Int *i_face_edge, HYPRE_Int *j_face_edge, - - HYPRE_Int num_elements, HYPRE_Int num_faces, HYPRE_Int num_edges) - +HYPRE_Int +matrix_matrix_product( HYPRE_Int **i_element_edge_pointer, + HYPRE_Int **j_element_edge_pointer, + HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, + HYPRE_Int *i_face_edge, HYPRE_Int *j_face_edge, + HYPRE_Int num_elements, HYPRE_Int num_faces, HYPRE_Int num_edges) { - /* FILE *f; */ - HYPRE_Int i, j, k, l, m; - - HYPRE_Int i_edge_on_local_list, i_edge_on_list; - HYPRE_Int local_element_edge_counter = 0, element_edge_counter = 0; - HYPRE_Int *j_local_element_edge; + /* FILE *f; */ + HYPRE_Int i, j, k, l, m; - - HYPRE_Int *i_element_edge, *j_element_edge; + HYPRE_Int i_edge_on_local_list, i_edge_on_list; + HYPRE_Int local_element_edge_counter = 0, element_edge_counter = 0; + HYPRE_Int *j_local_element_edge; + HYPRE_Int *i_element_edge, *j_element_edge; - j_local_element_edge = (HYPRE_Int *) malloc((num_edges+1) * sizeof(HYPRE_Int)); + j_local_element_edge = (HYPRE_Int *) malloc((num_edges+1) * sizeof(HYPRE_Int)); - i_element_edge = (HYPRE_Int *) malloc((num_elements+1) * sizeof(HYPRE_Int)); + i_element_edge = (HYPRE_Int *) malloc((num_elements+1) * sizeof(HYPRE_Int)); - for (i=0; i < num_elements+1; i++) - i_element_edge[i] = 0; + for (i=0; i < num_elements+1; i++) + i_element_edge[i] = 0; - for (i=0; i < num_elements; i++) - { + for (i=0; i < num_elements; i++) + { local_element_edge_counter = 0; for (j=i_element_face[i]; j < i_element_face[i+1]; j++) - { - k = j_element_face[j]; + { + k = j_element_face[j]; - for (l=i_face_edge[k]; l < i_face_edge[k+1]; l++) - { - /* element i and edge j_face_edge[l] are connected */ - - /* hypre_printf("element %d contains edge %d;\n", - i, j_face_edge[l]); */ - - i_edge_on_local_list = -1; - for (m=0; m < local_element_edge_counter; m++) - if (j_local_element_edge[m] == j_face_edge[l]) - { - i_edge_on_local_list++; - break; - } - - if (i_edge_on_local_list == -1) - { - i_element_edge[i]++; - j_local_element_edge[local_element_edge_counter]= - j_face_edge[l]; - local_element_edge_counter++; - } - } - } - } + for (l=i_face_edge[k]; l < i_face_edge[k+1]; l++) + { + /* element i and edge j_face_edge[l] are connected */ + + /* hypre_printf("element %d contains edge %d;\n", + i, j_face_edge[l]); */ + + i_edge_on_local_list = -1; + for (m=0; m < local_element_edge_counter; m++) + if (j_local_element_edge[m] == j_face_edge[l]) + { + i_edge_on_local_list++; + break; + } + + if (i_edge_on_local_list == -1) + { + i_element_edge[i]++; + j_local_element_edge[local_element_edge_counter]= + j_face_edge[l]; + local_element_edge_counter++; + } + } + } + } - free(j_local_element_edge); + free(j_local_element_edge); - for (i=0; i < num_elements; i++) - i_element_edge[i+1] += i_element_edge[i]; + for (i=0; i < num_elements; i++) + i_element_edge[i+1] += i_element_edge[i]; - for (i=num_elements; i>0; i--) - i_element_edge[i] = i_element_edge[i-1]; + for (i=num_elements; i>0; i--) + i_element_edge[i] = i_element_edge[i-1]; - i_element_edge[0] = 0; + i_element_edge[0] = 0; - j_element_edge = (HYPRE_Int *) malloc(i_element_edge[num_elements] - * sizeof(HYPRE_Int)); + j_element_edge = (HYPRE_Int *) malloc(i_element_edge[num_elements] + * sizeof(HYPRE_Int)); - /* fill--in the actual j_element_edge array: --------------------- */ + /* fill--in the actual j_element_edge array: --------------------- */ - element_edge_counter = 0; - for (i=0; i < num_elements; i++) - { + element_edge_counter = 0; + for (i=0; i < num_elements; i++) + { i_element_edge[i] = element_edge_counter; for (j=i_element_face[i]; j < i_element_face[i+1]; j++) - { - for (k=i_face_edge[j_element_face[j]]; - k= - i_element_edge[num_elements]) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"error in j_element_edge size: \n"); - break; - } - - j_element_edge[element_edge_counter] = - j_face_edge[k]; - element_edge_counter++; - } - } - } - - } + { + for (k=i_face_edge[j_element_face[j]]; + k= + i_element_edge[num_elements]) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"error in j_element_edge size: \n"); + break; + } + + j_element_edge[element_edge_counter] = + j_face_edge[k]; + element_edge_counter++; + } + } + } + + } + + i_element_edge[num_elements] = element_edge_counter; - fclose(f); - */ + /*------------------------------------------------------------------ + f = fopen("element_edge", "w"); + for (i=0; i < num_elements; i++) + { + hypre_printf("\nelement: %d has edges:\n", i); + for (j=i_element_edge[i]; j < i_element_edge[i+1]; j++) + { + hypre_printf("%d ", j_element_edge[j]); + hypre_fprintf(f, "%d %d\n", i, j_element_edge[j]); + } + + hypre_printf("\n"); + } - /* hypre_printf("end element_edge computation: ++++++++++++++++++++++++ \n");*/ + fclose(f); + */ - *i_element_edge_pointer = i_element_edge; - *j_element_edge_pointer = j_element_edge; + /* hypre_printf("end element_edge computation: ++++++++++++++++++++++++ \n");*/ - return hypre_error_flag; + *i_element_edge_pointer = i_element_edge; + *j_element_edge_pointer = j_element_edge; + return hypre_error_flag; } - /*-------------------------------------------------------------------------- - * hypre_AMGCreateDomainDof: + * hypre_AMGCreateDomainDof: *--------------------------------------------------------------------------*/ /***************************************************************************** @@ -1736,1159 +1445,1032 @@ * Routine for constructing graph domain_dof with minimal overlap * and computing the respective matrix inverses to be * used in an overlapping Schwarz procedure (like smoother - * in AMG); + * in AMG); * *****************************************************************************/ HYPRE_Int hypre_AMGCreateDomainDof(hypre_CSRMatrix *A, - HYPRE_Int domain_type, HYPRE_Int overlap, - HYPRE_Int num_functions, HYPRE_Int *dof_func, - hypre_CSRMatrix **domain_structure_pointer, + HYPRE_Int domain_type, HYPRE_Int overlap, + HYPRE_Int num_functions, HYPRE_Int *dof_func, + hypre_CSRMatrix **domain_structure_pointer, HYPRE_Int **piv_pointer, HYPRE_Int use_nonsymm) { - HYPRE_Int *i_domain_dof, *j_domain_dof; - HYPRE_Real *domain_matrixinverse; - HYPRE_Int num_domains; - hypre_CSRMatrix *domain_structure = NULL; - - HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A); - HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A); - HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A); - HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A); + HYPRE_Int *i_domain_dof, *j_domain_dof; + HYPRE_Real *domain_matrixinverse; + HYPRE_Int num_domains; + hypre_CSRMatrix *domain_structure = NULL; - /* HYPRE_Int *i_dof_to_accept_weight; */ - HYPRE_Int *i_dof_to_prefer_weight, - *w_dof_dof, *i_dof_weight; - HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof; - - HYPRE_Int *i_dof_index; + HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A); + HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A); + HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A); + HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A); - HYPRE_Int ierr = 0; - HYPRE_Int i,j,k, l_loc, i_loc, j_loc; - HYPRE_Int i_dof; - HYPRE_Int *i_local_to_global; - HYPRE_Int *i_global_to_local; + /* HYPRE_Int *i_dof_to_accept_weight; */ + HYPRE_Int *i_dof_to_prefer_weight, + *w_dof_dof, *i_dof_weight; + HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof; + HYPRE_Int *i_dof_index; - HYPRE_Int local_dof_counter, max_local_dof_counter=0; + HYPRE_Int ierr = 0; + HYPRE_Int i,j,k, l_loc, i_loc, j_loc; + HYPRE_Int i_dof; + HYPRE_Int *i_local_to_global; + HYPRE_Int *i_global_to_local; - HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0; - HYPRE_Int nf; + HYPRE_Int local_dof_counter, max_local_dof_counter=0; - HYPRE_Real *AE; + HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0; + HYPRE_Int nf; - HYPRE_Int piv_counter = 0; - HYPRE_Int *ipiv; - HYPRE_Int *piv = NULL; - char uplo = 'L'; - HYPRE_Int cnt; + HYPRE_Real *AE; + HYPRE_Int piv_counter = 0; + HYPRE_Int *ipiv; + HYPRE_Int *piv = NULL; + char uplo = 'L'; + HYPRE_Int cnt; + /* --------------------------------------------------------------------- */ - /* --------------------------------------------------------------------- */ + /*=======================================================================*/ + /* create artificial domains by agglomeration; */ + /*=======================================================================*/ - /*=======================================================================*/ - /* create artificial domains by agglomeration; */ - /*=======================================================================*/ + /*hypre_printf("----------- create artificials domain by agglomeration; ======\n"); + */ - /*hypre_printf("----------- create artificials domain by agglomeration; ======\n"); -*/ + if (num_dofs == 0) + { + *domain_structure_pointer = domain_structure; - if (num_dofs == 0) - { - *domain_structure_pointer = domain_structure; + *piv_pointer = piv; - *piv_pointer = piv; + return hypre_error_flag; + } - return hypre_error_flag; - } + i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_dofs+1); + j_aggregate_dof= hypre_CTAlloc(HYPRE_Int,num_dofs); - i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_dofs+1); - j_aggregate_dof= hypre_CTAlloc(HYPRE_Int,num_dofs); + if (domain_type == 2) + { + i_dof_to_prefer_weight = hypre_CTAlloc(HYPRE_Int,num_dofs); + w_dof_dof = hypre_CTAlloc(HYPRE_Int,i_dof_dof[num_dofs]); + i_dof_weight = hypre_CTAlloc(HYPRE_Int,num_dofs); - if (domain_type == 2) - { - i_dof_to_prefer_weight = hypre_CTAlloc(HYPRE_Int,num_dofs); - w_dof_dof = hypre_CTAlloc(HYPRE_Int,i_dof_dof[num_dofs]); - i_dof_weight = hypre_CTAlloc(HYPRE_Int,num_dofs); + for (i=0; i= i - && i_dof_index[j_dof_dof[k]]==-1) - { - i_dof_index[j_dof_dof[k]]++; - domain_dof_counter++; - } + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + for (k=i_dof_dof[j_aggregate_dof[j]]; + k= i + && i_dof_index[j_dof_dof[k]]==-1) + { + i_dof_index[j_dof_dof[k]]++; + domain_dof_counter++; + } + } - } + i_domain_dof[num_domains] = domain_dof_counter; + j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter); - i_domain_dof[num_domains] = domain_dof_counter; - j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter); + for (i=0; i < num_dofs; i++) + i_dof_index[i] = -1; - for (i=0; i < num_dofs; i++) - i_dof_index[i] = -1; + domain_dof_counter=0; + for (i=0; i < num_domains; i++) + { + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + i_dof_index[j_aggregate_dof[j]]=-1; - domain_dof_counter=0; - for (i=0; i < num_domains; i++) - { - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - i_dof_index[j_aggregate_dof[j]]=-1; - - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - for (k=i_dof_dof[j_aggregate_dof[j]]; - k= i - && i_dof_index[j_dof_dof[k]]==-1) - { - i_dof_index[j_dof_dof[k]]++; - j_domain_dof[domain_dof_counter] = j_dof_dof[k]; - domain_dof_counter++; - } + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + for (k=i_dof_dof[j_aggregate_dof[j]]; + k= i + && i_dof_index[j_dof_dof[k]]==-1) + { + i_dof_index[j_dof_dof[k]]++; + j_domain_dof[domain_dof_counter] = j_dof_dof[k]; + domain_dof_counter++; + } + } - } + hypre_TFree(i_aggregate_dof); + hypre_TFree(j_aggregate_dof); + hypre_TFree(i_dof_to_aggregate); + hypre_TFree(i_dof_index); + } + else if (overlap == 2) + { + i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1); - hypre_TFree(i_aggregate_dof); - hypre_TFree(j_aggregate_dof); - hypre_TFree(i_dof_to_aggregate); - hypre_TFree(i_dof_index); - } - else if (overlap == 2) - { - i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1); + i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs); - i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs); + for (i=0; i < num_dofs; i++) + i_dof_index[i] = -1; - for (i=0; i < num_dofs; i++) - i_dof_index[i] = -1; + domain_dof_counter=0; + for (i=0; i < num_domains; i++) + { + i_domain_dof[i] = domain_dof_counter; + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + for (k=i_dof_dof[j_aggregate_dof[j]]; + k max_local_dof_counter) - max_local_dof_counter = local_dof_counter; - } - - domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter); - if (use_nonsymm) - piv = hypre_CTAlloc(HYPRE_Int, piv_counter); - - - - - /* OLD-HYPRE_USING_ESSL - AE = hypre_CTAlloc(HYPRE_Real, max_local_dof_counter*max_local_dof_counter); - */ + max_local_dof_counter = local_dof_counter; + } - i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter); + domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter); + if (use_nonsymm) + piv = hypre_CTAlloc(HYPRE_Int, piv_counter); + i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter); - i_global_to_local = hypre_CTAlloc(HYPRE_Int,num_dofs); + i_global_to_local = hypre_CTAlloc(HYPRE_Int,num_dofs); - for (i=0; i < num_dofs; i++) - i_global_to_local[i] = -1; + for (i=0; i < num_dofs; i++) + i_global_to_local[i] = -1; - piv_counter = 0; - domain_matrixinverse_counter = 0; - for (i=0; i < num_domains; i++) - { + piv_counter = 0; + domain_matrixinverse_counter = 0; + for (i=0; i < num_domains; i++) + { local_dof_counter = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) { - i_global_to_local[j_domain_dof[j]] = local_dof_counter; - i_local_to_global[local_dof_counter] = j_domain_dof[j]; - local_dof_counter++; + i_global_to_local[j_domain_dof[j]] = local_dof_counter; + i_local_to_global[local_dof_counter] = j_domain_dof[j]; + local_dof_counter++; } - /* get local matrix in AE: ======================================== */ - cnt = 0; + cnt = 0; - /* OLD - HYPRE_USING_ESSL - for (i_loc=0; i_loc < local_dof_counter; i_loc++) - for (j_loc=0; j_loc < local_dof_counter; j_loc++) - AE[cnt++] = 0.e0; - for (i_loc=0; i_loc < local_dof_counter; i_loc++) - { - i_dof = i_local_to_global[i_loc]; - for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) - { - j_loc = i_global_to_local[j_dof_dof[j]]; - if (j_loc >=0) - AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j]; - } - } - cnt = local_dof_counter; - for (j_loc=1; j_loc < local_dof_counter; j_loc++) - for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++) - AE[cnt++] = AE[i_loc + j_loc * local_dof_counter]; - dppf(AE, local_dof_counter, 1); - cnt = domain_matrixinverse_counter; - size = local_dof_counter*(local_dof_counter+1)/2; - for (i_loc = 0; i_loc < size ; i_loc++) - domain_matrixinverse[cnt++] = AE[i_loc]; - domain_matrixinverse_counter += size; - */ AE = &domain_matrixinverse[domain_matrixinverse_counter]; ipiv = &piv[piv_counter]; for (i_loc=0; i_loc < local_dof_counter; i_loc++) - for (j_loc=0; j_loc < local_dof_counter; j_loc++) - AE[cnt++] = 0.e0; + for (j_loc=0; j_loc < local_dof_counter; j_loc++) + AE[cnt++] = 0.e0; for (i_loc=0; i_loc < local_dof_counter; i_loc++) { - i_dof = i_local_to_global[i_loc]; - for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) - { - j_loc = i_global_to_local[j_dof_dof[j]]; - if (j_loc >=0) - AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j]; - } + i_dof = i_local_to_global[i_loc]; + for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++) + { + j_loc = i_global_to_local[j_dof_dof[j]]; + if (j_loc >=0) + AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j]; + } } if (use_nonsymm) { -#ifdef HYPRE_USING_ESSL - dgetrf(local_dof_counter,local_dof_counter, AE, - local_dof_counter, ipiv, &ierr); -#else - hypre_F90_NAME_LAPACK(dgetrf,DGETRF)(&local_dof_counter, - &local_dof_counter, AE, - &local_dof_counter, ipiv, &ierr); -#endif + hypre_dgetrf(&local_dof_counter, + &local_dof_counter, AE, + &local_dof_counter, ipiv, &ierr); piv_counter +=local_dof_counter; } - + else { -#ifdef HYPRE_USING_ESSL - dpotrf(&uplo,local_dof_counter, AE, - local_dof_counter, &ierr); -#else - hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo,&local_dof_counter, AE, - &local_dof_counter, &ierr); -#endif - + hypre_dpotrf(&uplo,&local_dof_counter, AE, + &local_dof_counter, &ierr); } - - domain_matrixinverse_counter+=local_dof_counter*local_dof_counter; + + domain_matrixinverse_counter+=local_dof_counter*local_dof_counter; for (l_loc=0; l_loc < local_dof_counter; l_loc++) - i_global_to_local[i_local_to_global[l_loc]] = -1; - - } + i_global_to_local[i_local_to_global[l_loc]] = -1; - hypre_TFree(i_local_to_global); - hypre_TFree(i_global_to_local); -/* OLD - HYPRE_USING_ESSL - hypre_TFree(AE); -*/ - - domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, - i_domain_dof[num_domains]); - - hypre_CSRMatrixI(domain_structure) = i_domain_dof; - hypre_CSRMatrixJ(domain_structure) = j_domain_dof; - hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; + } - *domain_structure_pointer = domain_structure; + hypre_TFree(i_local_to_global); + hypre_TFree(i_global_to_local); - *piv_pointer = piv; + domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, + i_domain_dof[num_domains]); - return hypre_error_flag; + hypre_CSRMatrixI(domain_structure) = i_domain_dof; + hypre_CSRMatrixJ(domain_structure) = j_domain_dof; + hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; -} + *domain_structure_pointer = domain_structure; + *piv_pointer = piv; -/* unacceptable faces: i_face_to_prefer_weight[] = -1; ------------------*/ + return hypre_error_flag; +} +/* unacceptable faces: i_face_to_prefer_weight[] = -1; ------------------*/ HYPRE_Int hypre_AMGeAgglomerate(HYPRE_Int *i_AE_element, HYPRE_Int *j_AE_element, - - HYPRE_Int *i_face_face, HYPRE_Int *j_face_face, HYPRE_Int *w_face_face, - - HYPRE_Int *i_face_element, HYPRE_Int *j_face_element, - HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, - - HYPRE_Int *i_face_to_prefer_weight, - HYPRE_Int *i_face_weight, - - HYPRE_Int num_faces, HYPRE_Int num_elements, - HYPRE_Int *num_AEs_pointer) + HYPRE_Int *i_face_face, HYPRE_Int *j_face_face, HYPRE_Int *w_face_face, + HYPRE_Int *i_face_element, HYPRE_Int *j_face_element, + HYPRE_Int *i_element_face, HYPRE_Int *j_element_face, + HYPRE_Int *i_face_to_prefer_weight, + HYPRE_Int *i_face_weight, + HYPRE_Int num_faces, HYPRE_Int num_elements, + HYPRE_Int *num_AEs_pointer) { + HYPRE_Int ierr = 0; + HYPRE_Int i, j, k, l; - HYPRE_Int ierr = 0; - HYPRE_Int i, j, k, l; + HYPRE_Int face_to_eliminate; + HYPRE_Int max_weight_old, max_weight; - HYPRE_Int face_to_eliminate; - HYPRE_Int max_weight_old, max_weight; + HYPRE_Int AE_counter=0, AE_element_counter=0; - HYPRE_Int AE_counter=0, AE_element_counter=0; + /* HYPRE_Int i_element_face_counter; */ - /* HYPRE_Int i_element_face_counter; */ + HYPRE_Int *i_element_to_AE; - HYPRE_Int *i_element_to_AE; + HYPRE_Int *previous, *next, *first; + HYPRE_Int head, tail, last; - HYPRE_Int *previous, *next, *first; - HYPRE_Int head, tail, last; + HYPRE_Int face_max_weight, face_local_max_weight, preferred_weight; - HYPRE_Int face_max_weight, face_local_max_weight, preferred_weight; + HYPRE_Int weight, weight_max; - HYPRE_Int weight, weight_max; - - max_weight = 1; - for (i=0; i < num_faces; i++) - { + max_weight = 1; + for (i=0; i < num_faces; i++) + { weight = 1; for (j=i_face_face[i]; j < i_face_face[i+1]; j++) - weight+= w_face_face[j]; + weight+= w_face_face[j]; if (max_weight < weight) max_weight = weight; - } - - first = hypre_CTAlloc(HYPRE_Int, max_weight+1); - - - - next = hypre_CTAlloc(HYPRE_Int, num_faces); + } + first = hypre_CTAlloc(HYPRE_Int, max_weight+1); - previous = hypre_CTAlloc(HYPRE_Int, num_faces+1); + next = hypre_CTAlloc(HYPRE_Int, num_faces); + previous = hypre_CTAlloc(HYPRE_Int, num_faces+1); - tail = num_faces; - head = -1; + tail = num_faces; + head = -1; - for (i=0; i < num_faces; i++) - { + for (i=0; i < num_faces; i++) + { next[i] = i+1; previous[i] = i-1; - } - - last = num_faces-1; - previous[tail] = last; - - for (weight=1; weight <= max_weight; weight++) - first[weight] = tail; + } - i_element_to_AE = hypre_CTAlloc(HYPRE_Int, num_elements); + last = num_faces-1; + previous[tail] = last; - /*======================================================================= - AGGLOMERATION PROCEDURE: - ======================================================================= */ + for (weight=1; weight <= max_weight; weight++) + first[weight] = tail; - for (k=0; k < num_elements; k++) - i_element_to_AE[k] = -1; + i_element_to_AE = hypre_CTAlloc(HYPRE_Int, num_elements); - for (k=0; k < num_faces; k++) - i_face_weight[k] = 1; + /*======================================================================= + AGGLOMERATION PROCEDURE: + ======================================================================= */ + for (k=0; k < num_elements; k++) + i_element_to_AE[k] = -1; - first[0] = 0; - first[1] = 0; + for (k=0; k < num_faces; k++) + i_face_weight[k] = 1; - last = previous[tail]; - weight_max = i_face_weight[last]; + first[0] = 0; + first[1] = 0; + last = previous[tail]; + weight_max = i_face_weight[last]; - k = last; - face_max_weight = -1; - while (k!= head) - { + k = last; + face_max_weight = -1; + while (k!= head) + { if (i_face_to_prefer_weight[k] > -1) - face_max_weight = k; - + face_max_weight = k; + if (face_max_weight > -1) break; - - k=previous[k]; - } + k=previous[k]; + } - /* this will be used if the faces have been sorted: ***************** - k = last; - face_max_weight = -1; - while (k != head) - { + /* this will be used if the faces have been sorted: ***************** + k = last; + face_max_weight = -1; + while (k != head) + { if (i_face_to_prefer_weight[k] > -1) - face_max_weight = k; - + face_max_weight = k; - if (face_max_weight > -1) - { - max_weight = i_face_weight[face_max_weight]; - l = face_max_weight; - - while (previous[l] != head) - { - - if (i_face_weight[previous[l]] < max_weight) - break; - else - if (i_face_to_prefer_weight[previous[l]] > - i_face_to_prefer_weight[face_max_weight]) - { - l = previous[l]; - face_max_weight = l; - } - else - l = previous[l]; - } - break; - } + if (face_max_weight > -1) + { + max_weight = i_face_weight[face_max_weight]; + l = face_max_weight; + while (previous[l] != head) + { - l =previous[k]; + if (i_face_weight[previous[l]] < max_weight) + break; + else + if (i_face_to_prefer_weight[previous[l]] > + i_face_to_prefer_weight[face_max_weight]) + { + l = previous[l]; + face_max_weight = l; + } + else + l = previous[l]; + } + break; + } + l =previous[k]; weight = i_face_weight[k]; last = previous[tail]; - if (last == head) - weight_max = 0; + if (last == head) + weight_max = 0; else - weight_max = i_face_weight[last]; - - - ierr = hypre_remove_entry(weight, &weight_max, - previous, next, first, &last, - head, tail, - k); - - - + weight_max = i_face_weight[last]; + ierr = hypre_remove_entry(weight, &weight_max, + previous, next, first, &last, + head, tail, + k); k=l; - } - */ + } + */ - if (face_max_weight == -1) - { + if (face_max_weight == -1) + { hypre_error_w_msg(HYPRE_ERROR_GENERIC,"all faces are unacceptable, i.e., no faces to eliminate !\n"); *num_AEs_pointer = 1; i_AE_element[0] = 0; for (i=0; i < num_elements; i++) - { - i_element_to_AE[i] = 0; - j_AE_element[i] = i; - } + { + i_element_to_AE[i] = 0; + j_AE_element[i] = i; + } i_AE_element[1] = num_elements; return hypre_error_flag; - } - - for (k=0; k < num_faces; k++) - if (i_face_to_prefer_weight[k] > i_face_to_prefer_weight[face_max_weight]) - face_max_weight = k; - - max_weight = i_face_weight[face_max_weight]; + } - AE_counter=0; - AE_element_counter=0; - + for (k=0; k < num_faces; k++) + if (i_face_to_prefer_weight[k] > i_face_to_prefer_weight[face_max_weight]) + face_max_weight = k; - i_AE_element[AE_counter] = AE_element_counter; + max_weight = i_face_weight[face_max_weight]; - max_weight_old = -1; + AE_counter=0; + AE_element_counter=0; - face_local_max_weight = face_max_weight; + i_AE_element[AE_counter] = AE_element_counter; -eliminate_face: + max_weight_old = -1; - face_to_eliminate = face_local_max_weight; + face_local_max_weight = face_max_weight; - max_weight = i_face_weight[face_to_eliminate]; + eliminate_face: - last = previous[tail]; - if (last == head) - weight_max = 0; - else - weight_max = i_face_weight[last]; + face_to_eliminate = face_local_max_weight; - - ierr = hypre_remove_entry(max_weight, &weight_max, - previous, next, first, &last, - head, tail, - face_to_eliminate); + max_weight = i_face_weight[face_to_eliminate]; - i_face_weight[face_to_eliminate] = 0; + last = previous[tail]; + if (last == head) + weight_max = 0; + else + weight_max = i_face_weight[last]; - /*---------------------------------------------------------- - * agglomeration step: - * - * put on AE_element -- list all elements - * that share face "face_to_eliminate"; - *----------------------------------------------------------*/ + ierr = hypre_remove_entry(max_weight, &weight_max, + previous, next, first, &last, + head, tail, + face_to_eliminate); + + i_face_weight[face_to_eliminate] = 0; + + /*---------------------------------------------------------- + * agglomeration step: + * + * put on AE_element -- list all elements + * that share face "face_to_eliminate"; + *----------------------------------------------------------*/ - for (k = i_face_element[face_to_eliminate]; - k < i_face_element[face_to_eliminate+1]; k++) - { + for (k = i_face_element[face_to_eliminate]; + k < i_face_element[face_to_eliminate+1]; k++) + { /* check if element j_face_element[k] is already on the list: */ if (j_face_element[k] < num_elements) - { - if (i_element_to_AE[j_face_element[k]] == -1) - { - j_AE_element[AE_element_counter] = j_face_element[k]; - i_element_to_AE[j_face_element[k]] = AE_counter; - AE_element_counter++; - } - } - } - + { + if (i_element_to_AE[j_face_element[k]] == -1) + { + j_AE_element[AE_element_counter] = j_face_element[k]; + i_element_to_AE[j_face_element[k]] = AE_counter; + AE_element_counter++; + } + } + } - /* local update & search:==================================== */ + /* local update & search:==================================== */ - for (j=i_face_face[face_to_eliminate]; - j 0) + for (j=i_face_face[face_to_eliminate]; + j 0) { - weight = i_face_weight[j_face_face[j]]; + weight = i_face_weight[j_face_face[j]]; - last = previous[tail]; - if (last == head) - weight_max = 0; - else - weight_max = i_face_weight[last]; + last = previous[tail]; + if (last == head) + weight_max = 0; + else + weight_max = i_face_weight[last]; + + ierr = hypre_move_entry(weight, &weight_max, + previous, next, first, &last, + head, tail, + j_face_face[j]); + + i_face_weight[j_face_face[j]]+=w_face_face[j]; + + weight = i_face_weight[j_face_face[j]]; + + /* hypre_printf("update entry: %d\n", j_face_face[j]); */ + + last = previous[tail]; + if (last == head) + weight_max = 0; + else + weight_max = i_face_weight[last]; + + ierr = hypre_update_entry(weight, &weight_max, + previous, next, first, &last, + head, tail, + j_face_face[j]); + + last = previous[tail]; + if (last == head) + weight_max = 0; + else + weight_max = i_face_weight[last]; + } - ierr = hypre_move_entry(weight, &weight_max, - previous, next, first, &last, - head, tail, - j_face_face[j]); + /* find a face of the elements that have already been agglomerated + with a maximal weight: ====================================== */ - i_face_weight[j_face_face[j]]+=w_face_face[j]; + max_weight_old = max_weight; - weight = i_face_weight[j_face_face[j]]; + face_local_max_weight = -1; + preferred_weight = -1; + + for (l = i_AE_element[AE_counter]; + l < AE_element_counter; l++) + { + for (j=i_element_face[j_AE_element[l]]; + j 1 && i_face_weight[i] > 0 && + i_face_to_prefer_weight[i] > -1) + { + if ( max_weight < i_face_weight[i]) + { + face_local_max_weight = i; + max_weight = i_face_weight[i]; + preferred_weight = i_face_to_prefer_weight[i]; + } - last = previous[tail]; - if (last == head) - weight_max = 0; - else - weight_max = i_face_weight[last]; + if ( max_weight == i_face_weight[i] + && i_face_to_prefer_weight[i] > preferred_weight) + { + face_local_max_weight = i; + preferred_weight = i_face_to_prefer_weight[i]; + } + } + } + } - ierr = hypre_update_entry(weight, &weight_max, - previous, next, first, &last, - head, tail, - j_face_face[j]); + if (face_local_max_weight > -1) goto eliminate_face; - last = previous[tail]; - if (last == head) - weight_max = 0; - else - weight_max = i_face_weight[last]; - - } + /* ---------------------------------------------------------------- + * eliminate and label with i_face_weight[ ] = -1 + * "boundary faces of agglomerated elements"; + * those faces will be preferred for the next coarse spaces + * in case multiple coarse spaces are to be built; + * ---------------------------------------------------------------*/ - /* find a face of the elements that have already been agglomerated - with a maximal weight: ====================================== */ - - max_weight_old = max_weight; + for (k = i_AE_element[AE_counter]; k < AE_element_counter; k++) + { + for (j = i_element_face[j_AE_element[k]]; + j < i_element_face[j_AE_element[k]+1]; j++) + { + if (i_face_weight[j_element_face[j]] > 0) + { + weight = i_face_weight[j_element_face[j]]; + last = previous[tail]; + if (last == head) + weight_max = 0; + else + weight_max = i_face_weight[last]; - face_local_max_weight = -1; - preferred_weight = -1; - for (l = i_AE_element[AE_counter]; - l < AE_element_counter; l++) - { - for (j=i_element_face[j_AE_element[l]]; - j 1 && i_face_weight[i] > 0 && - i_face_to_prefer_weight[i] > -1) - { - if ( max_weight < i_face_weight[i]) - { - face_local_max_weight = i; - max_weight = i_face_weight[i]; - preferred_weight = i_face_to_prefer_weight[i]; - } - - if ( max_weight == i_face_weight[i] - && i_face_to_prefer_weight[i] > preferred_weight) - { - face_local_max_weight = i; - preferred_weight = i_face_to_prefer_weight[i]; - } - - } - } - } - - if (face_local_max_weight > -1) goto eliminate_face; - - /* ---------------------------------------------------------------- - * eliminate and label with i_face_weight[ ] = -1 - * "boundary faces of agglomerated elements"; - * those faces will be preferred for the next coarse spaces - * in case multiple coarse spaces are to be built; - * ---------------------------------------------------------------*/ + ierr = hypre_remove_entry(weight, &weight_max, + previous, next, first, &last, + head, tail, + j_element_face[j]); - for (k = i_AE_element[AE_counter]; k < AE_element_counter; k++) - { - for (j = i_element_face[j_AE_element[k]]; - j < i_element_face[j_AE_element[k]+1]; j++) - { - if (i_face_weight[j_element_face[j]] > 0) - { - weight = i_face_weight[j_element_face[j]]; - last = previous[tail]; - if (last == head) - weight_max = 0; - else - weight_max = i_face_weight[last]; - - - ierr = hypre_remove_entry(weight, &weight_max, - previous, next, first, &last, - head, tail, - j_element_face[j]); + i_face_weight[j_element_face[j]] = -1; - i_face_weight[j_element_face[j]] = -1; + } + } + } - } - } - } - - if (AE_element_counter > i_AE_element[AE_counter]) - { - /* hypre_printf("completing agglomerated element: %d\n", - AE_counter); */ + if (AE_element_counter > i_AE_element[AE_counter]) + { + /* hypre_printf("completing agglomerated element: %d\n", + AE_counter); */ AE_counter++; - } - - i_AE_element[AE_counter] = AE_element_counter; - + } - /* find a face with maximal weight: ---------------------------*/ + i_AE_element[AE_counter] = AE_element_counter; + /* find a face with maximal weight: ---------------------------*/ - last = previous[tail]; - if (last == head) goto end_agglomerate; + last = previous[tail]; + if (last == head) goto end_agglomerate; - weight_max = i_face_weight[last]; + weight_max = i_face_weight[last]; - - /* hypre_printf("global search: ======================================\n"); */ + /* hypre_printf("global search: ======================================\n"); */ - face_max_weight = -1; + face_max_weight = -1; - k = last; - while (k != head) - { + k = last; + while (k != head) + { if (i_face_to_prefer_weight[k] > -1) - face_max_weight = k; + face_max_weight = k; + if (face_max_weight > -1) + { + max_weight = i_face_weight[face_max_weight]; + l = face_max_weight; - if (face_max_weight > -1) - { - max_weight = i_face_weight[face_max_weight]; - l = face_max_weight; - - while (previous[l] != head) - { - - if (i_face_weight[previous[l]] < max_weight) - break; - else - if (i_face_to_prefer_weight[previous[l]] > - i_face_to_prefer_weight[face_max_weight]) - { - l = previous[l]; - face_max_weight = l; - } - else - l = previous[l]; - } + while (previous[l] != head) + { - break; - } + if (i_face_weight[previous[l]] < max_weight) + break; + else + if (i_face_to_prefer_weight[previous[l]] > + i_face_to_prefer_weight[face_max_weight]) + { + l = previous[l]; + face_max_weight = l; + } + else + l = previous[l]; + } + break; + } l =previous[k]; /* remove face k: ---------------------------------------*/ - weight = i_face_weight[k]; last = previous[tail]; - if (last == head) - weight_max = 0; + if (last == head) + weight_max = 0; else - weight_max = i_face_weight[last]; + weight_max = i_face_weight[last]; - ierr = hypre_remove_entry(weight, &weight_max, - previous, next, first, &last, - head, tail, - k); + ierr = hypre_remove_entry(weight, &weight_max, + previous, next, first, &last, + head, tail, + k); - /* i_face_weight[k] = -1; */ - k=l; - } - - if (face_max_weight == -1) goto end_agglomerate; + } - max_weight = i_face_weight[face_max_weight]; + if (face_max_weight == -1) goto end_agglomerate; - face_local_max_weight = face_max_weight; + max_weight = i_face_weight[face_max_weight]; - goto eliminate_face; + face_local_max_weight = face_max_weight; -end_agglomerate: + goto eliminate_face; + end_agglomerate: - /* eliminate isolated elements: ----------------------------------*/ + /* eliminate isolated elements: ----------------------------------*/ - for (i=0; i -1) - for (k=i_face_element[j_element_face[j]]; - k -1) + for (k=i_face_element[j_element_face[j]]; + k -1) - i_element_face_counter++; - - if (i_element_face_counter == 1) - { - for (j=i_element_face[i]; j < i_element_face[i+1]; j++) - if (i_face_to_prefer_weight[j_element_face[j]] > -1) - for (k=i_face_element[j_element_face[j]]; - k -1) + i_element_face_counter++; + + if (i_element_face_counter == 1) + { + for (j=i_element_face[i]; j < i_element_face[i+1]; j++) + if (i_face_to_prefer_weight[j_element_face[j]] > -1) + for (k=i_face_element[j_element_face[j]]; + k -1; i--) - i_AE_element[i] = i_AE_element[i+1] - i_AE_element[i]; + for (i=AE_counter-1; i > -1; i--) + i_AE_element[i] = i_AE_element[i+1] - i_AE_element[i]; - for (i=0; i < num_elements; i++) - { + for (i=0; i < num_elements; i++) + { j_AE_element[i_AE_element[i_element_to_AE[i]]] = i; i_AE_element[i_element_to_AE[i]]++; - } + } - for (i=AE_counter-1; i > -1; i--) - i_AE_element[i+1] = i_AE_element[i]; + for (i=AE_counter-1; i > -1; i--) + i_AE_element[i+1] = i_AE_element[i]; - i_AE_element[0] = 0; + i_AE_element[0] = 0; - /*--------------------------------------------------------------------*/ - for (i=0; i < num_faces; i++) - if (i_face_to_prefer_weight[i] == -1) i_face_weight[i] = -1; + /*--------------------------------------------------------------------*/ + for (i=0; i < num_faces; i++) + if (i_face_to_prefer_weight[i] == -1) i_face_weight[i] = -1; - hypre_TFree(i_element_to_AE); + hypre_TFree(i_element_to_AE); - hypre_TFree(previous); - hypre_TFree(next); - hypre_TFree(first); + hypre_TFree(previous); + hypre_TFree(next); + hypre_TFree(first); - return ierr; + return ierr; } -HYPRE_Int hypre_update_entry(HYPRE_Int weight, HYPRE_Int *weight_max, - HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, - HYPRE_Int head, HYPRE_Int tail, - HYPRE_Int i) +HYPRE_Int hypre_update_entry(HYPRE_Int weight, HYPRE_Int *weight_max, + HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, + HYPRE_Int head, HYPRE_Int tail, + HYPRE_Int i) { - HYPRE_Int weight0; - - if (previous[i] != head) next[previous[i]] = next[i]; - previous[next[i]] = previous[i]; + HYPRE_Int weight0; + if (previous[i] != head) next[previous[i]] = next[i]; + previous[next[i]] = previous[i]; - if (first[weight] == tail) - { - if (weight <= weight_max[0]) - { - hypre_printf("ERROR IN UPDATE_ENTRY: ===================\n"); - hypre_printf("weight: %d, weight_max: %d\n", - weight, weight_max[0]); - return -1; - } + if (first[weight] == tail) + { + if (weight <= weight_max[0]) + { + hypre_printf("ERROR IN UPDATE_ENTRY: ===================\n"); + hypre_printf("weight: %d, weight_max: %d\n", + weight, weight_max[0]); + return -1; + } for (weight0=weight_max[0]+1; weight0 <= weight; weight0++) - { - first[weight0] = i; - /* hypre_printf("create first[%d] = %d\n", weight0, i); */ - } - - previous[i] = previous[tail]; - next[i] = tail; - if (previous[tail] > head) - next[previous[tail]] = i; - previous[tail] = i; + { + first[weight0] = i; + /* hypre_printf("create first[%d] = %d\n", weight0, i); */ + } - } - else - /* first[weight] already exists: =====================*/ - { + previous[i] = previous[tail]; + next[i] = tail; + if (previous[tail] > head) + next[previous[tail]] = i; + previous[tail] = i; + + } + else + /* first[weight] already exists: =====================*/ + { previous[i] = previous[first[weight]]; next[i] = first[weight]; - + if (previous[first[weight]] != head) - next[previous[first[weight]]] = i; + next[previous[first[weight]]] = i; previous[first[weight]] = i; for (weight0=1; weight0 <= weight; weight0++) - if (first[weight0] == first[weight]) - first[weight0] = i; - - } + if (first[weight0] == first[weight]) + first[weight0] = i; + } - return 0; - + return 0; } -HYPRE_Int hypre_remove_entry(HYPRE_Int weight, HYPRE_Int *weight_max, - HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, - HYPRE_Int head, HYPRE_Int tail, - HYPRE_Int i) +HYPRE_Int hypre_remove_entry(HYPRE_Int weight, HYPRE_Int *weight_max, + HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, + HYPRE_Int head, HYPRE_Int tail, + HYPRE_Int i) { - HYPRE_Int weight0; + HYPRE_Int weight0; - if (previous[i] != head) next[previous[i]] = next[i]; - previous[next[i]] = previous[i]; + if (previous[i] != head) next[previous[i]] = next[i]; + previous[next[i]] = previous[i]; - for (weight0=1; weight0 <= weight_max[0]; weight0++) - { + for (weight0=1; weight0 <= weight_max[0]; weight0++) + { /* hypre_printf("first[%d}: %d\n", weight0, first[weight0]); */ if (first[weight0] == i) - { - first[weight0] = next[i]; - /* hypre_printf("shift: first[%d]= %d to %d\n", - weight0, i, next[i]); - if (i == last[0]) - hypre_printf("i= last[0]: %d\n", i); */ - } - } - - next[i] = i; - previous[i] = i; + { + first[weight0] = next[i]; + /* hypre_printf("shift: first[%d]= %d to %d\n", + weight0, i, next[i]); + if (i == last[0]) + hypre_printf("i= last[0]: %d\n", i); */ + } + } - return 0; + next[i] = i; + previous[i] = i; + return 0; } -HYPRE_Int hypre_move_entry(HYPRE_Int weight, HYPRE_Int *weight_max, - HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, - HYPRE_Int head, HYPRE_Int tail, - HYPRE_Int i) +HYPRE_Int hypre_move_entry(HYPRE_Int weight, HYPRE_Int *weight_max, + HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last, + HYPRE_Int head, HYPRE_Int tail, + HYPRE_Int i) { - HYPRE_Int weight0; + HYPRE_Int weight0; - if (previous[i] != head) next[previous[i]] = next[i]; - previous[next[i]] = previous[i]; + if (previous[i] != head) next[previous[i]] = next[i]; + previous[next[i]] = previous[i]; - for (weight0=1; weight0 <= weight_max[0]; weight0++) - { + for (weight0=1; weight0 <= weight_max[0]; weight0++) + { if (first[weight0] == i) - first[weight0] = next[i]; - } - - return 0; + first[weight0] = next[i]; + } + return 0; } - - /*--------------------------------------------------------------------- - hypre_matinv: X <-- A**(-1) ; A IS POSITIVE DEFINITE (non--symmetric); - ---------------------------------------------------------------------*/ - + hypre_matinv: X <-- A**(-1) ; A IS POSITIVE DEFINITE (non--symmetric); + ---------------------------------------------------------------------*/ + HYPRE_Int hypre_matinv(HYPRE_Real *x, HYPRE_Real *a, HYPRE_Int k) { - HYPRE_Int i,j,l, ierr =0; - - + HYPRE_Int i,j,l, ierr =0; - for (i=0; i < k; i++) - { + for (i=0; i < k; i++) + { if (a[i+i*k] <= 0.e0) - { - if (i < k-1) - { - /********* - hypre_printf("indefinite singular matrix in *** matinv ***:\n"); - hypre_printf("i:%d; diagonal entry: %e\n", i, a[i+k*i]); - */ - ierr = -1; - } - - a[i+i*k] = 0.e0; - } - else - a[i+k*i] = 1.0 / a[i+i*k]; - + { + if (i < k-1) + { + /********* + hypre_printf("indefinite singular matrix in *** matinv ***:\n"); + hypre_printf("i:%d; diagonal entry: %e\n", i, a[i+k*i]); + */ + ierr = -1; + } + a[i+i*k] = 0.e0; + } + else + a[i+k*i] = 1.0 / a[i+i*k]; for (j=1; j < k-i; j++) - { - for (l=1; l < k-i; l++) - { - a[i+l+k*(i+j)] -= a[i+l+k*i] * a[i+k*i] * a[i+k*(i+j)]; - } - } - - for (j=1; j < k-i; j++) - { - a[i+j+k*i] = a[i+j+k*i] * a[i+k*i]; - a[i+k*(i+j)] = a[i+k*(i+j)] * a[i+k*i]; - } - } - - - - /* FULL INVERSION: --------------------------------------------*/ - - - + { + for (l=1; l < k-i; l++) + { + a[i+l+k*(i+j)] -= a[i+l+k*i] * a[i+k*i] * a[i+k*(i+j)]; + } + } - x[k*k-1] = a[k*k-1]; - for (i=k-1; i > -1; i--) - { for (j=1; j < k-i; j++) - { - x[i+j+k*i] =0; - x[i+k*(i+j)] =0; - - + { + a[i+j+k*i] = a[i+j+k*i] * a[i+k*i]; + a[i+k*(i+j)] = a[i+k*(i+j)] * a[i+k*i]; + } + } - for (l=1; l< k-i; l++) - { - x[i+j+k*i] -= x[i+j+k*(i+l)] * a[i+l+k*i]; - x[i+k*(i+j)] -= a[i+k*(i+l)] * x[i+l+k*(i+j)]; - } - } + /* FULL INVERSION: --------------------------------------------*/ + x[k*k-1] = a[k*k-1]; + for (i=k-1; i > -1; i--) + { + for (j=1; j < k-i; j++) + { + x[i+j+k*i] =0; + x[i+k*(i+j)] =0; + for (l=1; l< k-i; l++) + { + x[i+j+k*i] -= x[i+j+k*(i+l)] * a[i+l+k*i]; + x[i+k*(i+j)] -= a[i+k*(i+l)] * x[i+l+k*(i+j)]; + } + } x[i+k*i] = a[i+k*i]; for (j=1; j 1) - { + for (i=0; i < num_cols_offd; i++) + i_proc[i] = 0; - hypre_MatvecCommPkgCreate(A); + if (comm_pkg) + { + num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); + recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg); + } + else if (num_procs > 1) + { - comm_pkg = hypre_ParCSRMatrixCommPkg(A); - num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); - recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg); - } + hypre_MatvecCommPkgCreate(A); - for (i=0; i < num_recvs; i++) - for (indx=recv_vec_starts[i]; indx < recv_vec_starts[i+1]; indx++) - i_proc[indx] = i; + comm_pkg = hypre_ParCSRMatrixCommPkg(A); + num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); + recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg); + } - /* make domains from aggregates: *********************************/ + for (i=0; i < num_recvs; i++) + for (indx=recv_vec_starts[i]; indx < recv_vec_starts[i+1]; indx++) + i_proc[indx] = i; - i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables); + /* make domains from aggregates: *********************************/ - i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd); + i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables); - for (i=0; i < num_variables; i++) - i_dof_index[i] = -1; + i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd); - for (i=0; i < num_cols_offd; i++) - i_dof_index_offd[i] = -1; + for (i=0; i < num_variables; i++) + i_dof_index[i] = -1; - domain_dof_counter=0; - for (i=0; i < num_domains; i++) - { - i_domain_dof[i] = domain_dof_counter; - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - { - i_dof_index[j_aggregate_dof[j]]=-1; - } - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - { - for (k=a_diag_i[j_aggregate_dof[j]]; - k= i - && i_dof_index[a_diag_j[k]]==-1) - { - i_dof_index[a_diag_j[k]]++; - domain_dof_counter++; - } - for (k=a_offd_i[j_aggregate_dof[j]]; - k my_id - && i_dof_index_offd[a_offd_j[k]]==-1) - { - i_dof_index_offd[a_offd_j[k]]++; - domain_dof_counter++; - } - } - } + for (i=0; i < num_cols_offd; i++) + i_dof_index_offd[i] = -1; - for (i=0; i < num_variables; i++) - i_dof_index[i] = -1; + domain_dof_counter=0; + for (i=0; i < num_domains; i++) + { + i_domain_dof[i] = domain_dof_counter; + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + { + i_dof_index[j_aggregate_dof[j]]=-1; + } + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + { + for (k=a_diag_i[j_aggregate_dof[j]]; + k= i + && i_dof_index[a_diag_j[k]]==-1) + { + i_dof_index[a_diag_j[k]]++; + domain_dof_counter++; + } + for (k=a_offd_i[j_aggregate_dof[j]]; + k my_id + && i_dof_index_offd[a_offd_j[k]]==-1) + { + i_dof_index_offd[a_offd_j[k]]++; + domain_dof_counter++; + } + } + } - for (i=0; i < num_cols_offd; i++) - i_dof_index_offd[i] = -1; + for (i=0; i < num_variables; i++) + i_dof_index[i] = -1; - i_domain_dof[num_domains] = domain_dof_counter; - j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter); + for (i=0; i < num_cols_offd; i++) + i_dof_index_offd[i] = -1; - domain_dof_counter=0; - for (i=0; i < num_domains; i++) - { - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - { - i_dof_index[j_aggregate_dof[j]]=-1; - } - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - { - for (k=a_diag_i[j_aggregate_dof[j]]; - k= i - && i_dof_index[a_diag_j[k]]==-1) - { - i_dof_index[a_diag_j[k]]++; - j_domain_dof[domain_dof_counter] = a_diag_j[k]; - domain_dof_counter++; - } - for (k=a_offd_i[j_aggregate_dof[j]]; - k my_id - && i_dof_index_offd[a_offd_j[k]]==-1) - { - i_dof_index_offd[a_offd_j[k]]++; - j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables; - domain_dof_counter++; - } - } - } + i_domain_dof[num_domains] = domain_dof_counter; + j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter); - hypre_TFree(i_aggregate_dof); - hypre_TFree(j_aggregate_dof); - hypre_TFree(i_dof_to_aggregate); - hypre_TFree(i_dof_index); - hypre_TFree(i_dof_index_offd); - hypre_TFree(i_proc); - } - else if (overlap == 2) - { - i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1); + domain_dof_counter=0; + for (i=0; i < num_domains; i++) + { + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + { + i_dof_index[j_aggregate_dof[j]]=-1; + } + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + { + for (k=a_diag_i[j_aggregate_dof[j]]; + k= i + && i_dof_index[a_diag_j[k]]==-1) + { + i_dof_index[a_diag_j[k]]++; + j_domain_dof[domain_dof_counter] = a_diag_j[k]; + domain_dof_counter++; + } + for (k=a_offd_i[j_aggregate_dof[j]]; + k my_id + && i_dof_index_offd[a_offd_j[k]]==-1) + { + i_dof_index_offd[a_offd_j[k]]++; + j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables; + domain_dof_counter++; + } + } + } - i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables); - for (i=0; i < num_domains; i++) - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - i_dof_to_aggregate[j_aggregate_dof[j]] = i; + hypre_TFree(i_aggregate_dof); + hypre_TFree(j_aggregate_dof); + hypre_TFree(i_dof_to_aggregate); + hypre_TFree(i_dof_index); + hypre_TFree(i_dof_index_offd); + hypre_TFree(i_proc); + } + else if (overlap == 2) + { + i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1); - /* make domains from aggregates: *********************************/ + i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables); + for (i=0; i < num_domains; i++) + for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) + i_dof_to_aggregate[j_aggregate_dof[j]] = i; - i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables); + /* make domains from aggregates: *********************************/ - i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd); + i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables); - for (i=0; i < num_variables; i++) - i_dof_index[i] = -1; + i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd); - for (i=0; i < num_cols_offd; i++) - i_dof_index_offd[i] = -1; + for (i=0; i < num_variables; i++) + i_dof_index[i] = -1; - domain_dof_counter=0; - for (i=0; i < num_domains; i++) - { - i_domain_dof[i] = domain_dof_counter; - for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++) - { - for (k=a_diag_i[j_aggregate_dof[j]]; - k max_local_dof_counter) - max_local_dof_counter = local_dof_counter; - } - - domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter); - if (use_nonsymm) - piv = hypre_CTAlloc(HYPRE_Int, piv_counter); + max_local_dof_counter = local_dof_counter; + } -/* OLD- HYPRE_USING_ESSL - AE = hypre_CTAlloc(HYPRE_Real, max_local_dof_counter*max_local_dof_counter); -*/ + domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter); + if (use_nonsymm) + piv = hypre_CTAlloc(HYPRE_Int, piv_counter); - if (num_procs > 1) - { - A_ext = hypre_ParCSRMatrixExtractBExt(A,A,1); - a_ext_i = hypre_CSRMatrixI(A_ext); - a_ext_j = hypre_CSRMatrixJ(A_ext); - a_ext_data = hypre_CSRMatrixData(A_ext); - } - else - A_ext = NULL; + if (num_procs > 1) + { + A_ext = hypre_ParCSRMatrixExtractBExt(A,A,1); + a_ext_i = hypre_CSRMatrixI(A_ext); + a_ext_j = hypre_CSRMatrixJ(A_ext); + a_ext_data = hypre_CSRMatrixData(A_ext); + } + else + A_ext = NULL; - i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter); + i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter); - i_global_to_local = hypre_CTAlloc(HYPRE_Int, num_variables+num_cols_offd); + i_global_to_local = hypre_CTAlloc(HYPRE_Int, num_variables+num_cols_offd); - for (i=0; i < num_variables+num_cols_offd; i++) - i_global_to_local[i] = -1; + for (i=0; i < num_variables+num_cols_offd; i++) + i_global_to_local[i] = -1; - piv_counter = 0; - domain_matrixinverse_counter = 0; - for (i=0; i < num_domains; i++) - { + piv_counter = 0; + domain_matrixinverse_counter = 0; + for (i=0; i < num_domains; i++) + { local_dof_counter = 0; for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) - { - i_global_to_local[j_domain_dof[j]] = local_dof_counter; - i_local_to_global[local_dof_counter] = j_domain_dof[j]; - local_dof_counter++; - } + { + i_global_to_local[j_domain_dof[j]] = local_dof_counter; + i_local_to_global[local_dof_counter] = j_domain_dof[j]; + local_dof_counter++; + } /* get local matrix in AE: ======================================== */ AE = &domain_matrixinverse[domain_matrixinverse_counter]; ipiv = &piv[piv_counter]; - cnt = 0; + cnt = 0; for (i_loc=0; i_loc < local_dof_counter; i_loc++) - for (j_loc=0; j_loc < local_dof_counter; j_loc++) - AE[cnt++] = 0.e0; + for (j_loc=0; j_loc < local_dof_counter; j_loc++) + AE[cnt++] = 0.e0; for (i_loc=0; i_loc < local_dof_counter; i_loc++) - { - i_dof = i_local_to_global[i_loc]; - if (i_dof < num_variables) - { - for (j=a_diag_i[i_dof]; j < a_diag_i[i_dof+1]; j++) - { - j_loc = i_global_to_local[a_diag_j[j]]; - if (j_loc >=0) - AE[i_loc + j_loc * local_dof_counter] = a_diag_data[j]; - } - for (j=a_offd_i[i_dof]; j < a_offd_i[i_dof+1]; j++) - { - j_loc = i_global_to_local[a_offd_j[j]+num_variables]; - if (j_loc >=0) - AE[i_loc + j_loc * local_dof_counter] = a_offd_data[j]; - } - } - else - { - i_dof -= num_variables; - for (j=a_ext_i[i_dof]; j < a_ext_i[i_dof+1]; j++) - { - jj = a_ext_j[j]; - if (jj > col_0 && jj < col_n) - { - jj = jj - first_col_diag; - } - else - { - jj = hypre_BinarySearch(col_map_offd,jj,num_cols_offd); - if (jj > -1) jj += num_variables; - } - if (jj > -1) - { - j_loc = i_global_to_local[jj]; - if (j_loc >=0) - AE[i_loc + j_loc * local_dof_counter] = a_ext_data[j]; - } - } - } - } - -/* OLD- HYPRE_USING_ESSL - cnt = local_dof_counter; - for (j_loc=1; j_loc < local_dof_counter; j_loc++) - for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++) - AE[cnt++] = AE[i_loc + j_loc * local_dof_counter]; - dppf(AE, local_dof_counter, 1); - size = local_dof_counter*(local_dof_counter+1)/2; - cnt = domain_matrixinverse_counter; - for (i_loc = 0; i_loc < size; i_loc++) - domain_matrixinverse[cnt++] = AE[i_loc]; - domain_matrixinverse_counter += size; -*/ - + { + i_dof = i_local_to_global[i_loc]; + if (i_dof < num_variables) + { + for (j=a_diag_i[i_dof]; j < a_diag_i[i_dof+1]; j++) + { + j_loc = i_global_to_local[a_diag_j[j]]; + if (j_loc >=0) + AE[i_loc + j_loc * local_dof_counter] = a_diag_data[j]; + } + for (j=a_offd_i[i_dof]; j < a_offd_i[i_dof+1]; j++) + { + j_loc = i_global_to_local[a_offd_j[j]+num_variables]; + if (j_loc >=0) + AE[i_loc + j_loc * local_dof_counter] = a_offd_data[j]; + } + } + else + { + i_dof -= num_variables; + for (j=a_ext_i[i_dof]; j < a_ext_i[i_dof+1]; j++) + { + jj = a_ext_j[j]; + if (jj > col_0 && jj < col_n) + { + jj = jj - first_col_diag; + } + else + { + jj = hypre_BinarySearch(col_map_offd,jj,num_cols_offd); + if (jj > -1) jj += num_variables; + } + if (jj > -1) + { + j_loc = i_global_to_local[jj]; + if (j_loc >=0) + AE[i_loc + j_loc * local_dof_counter] = a_ext_data[j]; + } + } + } + } + if (use_nonsymm) { -#ifdef HYPRE_USING_ESSL - dgetrf(local_dof_counter,local_dof_counter, AE, - local_dof_counter, ipiv, &ierr); -#else - hypre_F90_NAME_LAPACK(dgetrf,DGETRF)(&local_dof_counter, - &local_dof_counter, AE, - &local_dof_counter, ipiv, &ierr); -#endif + hypre_dgetrf(&local_dof_counter, + &local_dof_counter, AE, + &local_dof_counter, ipiv, &ierr); piv_counter +=local_dof_counter; } - + else { -#ifdef HYPRE_USING_ESSL - dpotrf(&uplo,local_dof_counter, AE, - local_dof_counter, &ierr); -#else - hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo,&local_dof_counter, AE, - &local_dof_counter, &ierr); -#endif - + hypre_dpotrf(&uplo,&local_dof_counter, AE, + &local_dof_counter, &ierr); } - + domain_matrixinverse_counter+=local_dof_counter*local_dof_counter; for (l_loc=0; l_loc < local_dof_counter; l_loc++) - i_global_to_local[i_local_to_global[l_loc]] = -1; - - } + i_global_to_local[i_local_to_global[l_loc]] = -1; - hypre_TFree(i_local_to_global); - hypre_TFree(i_global_to_local); - hypre_CSRMatrixDestroy(A_ext); - -/* OLD -HYPRE_USING_ESSL - hypre_TFree(AE); -*/ - - domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, - i_domain_dof[num_domains]); - - hypre_CSRMatrixI(domain_structure) = i_domain_dof; - hypre_CSRMatrixJ(domain_structure) = j_domain_dof; - hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; - - *domain_structure_pointer = domain_structure; - *piv_pointer = piv; - return hypre_error_flag; + } + + hypre_TFree(i_local_to_global); + hypre_TFree(i_global_to_local); + hypre_CSRMatrixDestroy(A_ext); + + domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, + i_domain_dof[num_domains]); + + hypre_CSRMatrixI(domain_structure) = i_domain_dof; + hypre_CSRMatrixJ(domain_structure) = j_domain_dof; + hypre_CSRMatrixData(domain_structure) = domain_matrixinverse; + *domain_structure_pointer = domain_structure; + *piv_pointer = piv; + return hypre_error_flag; } HYPRE_Int hypre_ParGenerateScale(hypre_ParCSRMatrix *A, - hypre_CSRMatrix *domain_structure, - HYPRE_Real relaxation_weight, - HYPRE_Real **scale_pointer) + hypre_CSRMatrix *domain_structure, + HYPRE_Real relaxation_weight, + HYPRE_Real **scale_pointer) { HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); @@ -4034,7 +3484,7 @@ HYPRE_Int num_variables = hypre_ParCSRMatrixNumRows(A); HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A)); HYPRE_Int j_loc, index, start; - + hypre_ParCSRCommHandle *comm_handle; if (comm_pkg) @@ -4048,20 +3498,20 @@ if (num_cols_offd) scale_ext = hypre_CTAlloc(HYPRE_Real,num_cols_offd); for (i=0; i < num_domains; i++) - { + { for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) { - j_loc = j_domain_dof[j]; - if (j_loc < num_variables) - scale[j_loc] += 1.0; - else - scale_ext[j_loc-num_variables] += 1.0; + j_loc = j_domain_dof[j]; + if (j_loc < num_variables) + scale[j_loc] += 1.0; + else + scale_ext[j_loc-num_variables] += 1.0; } } if (comm_pkg) { scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]); - comm_handle = hypre_ParCSRCommHandleCreate (2,comm_pkg,scale_ext,scale_int); + comm_handle = hypre_ParCSRCommHandleCreate (2,comm_pkg,scale_ext,scale_int); hypre_ParCSRCommHandleDestroy(comm_handle); comm_handle = NULL; @@ -4072,7 +3522,7 @@ { start = send_map_starts[i]; for (j=start; j < send_map_starts[i+1]; j++) - scale[send_map_elmts[j]] += scale_int[index++]; + scale[send_map_elmts[j]] += scale_int[index++]; } if (comm_pkg) hypre_TFree(scale_int); @@ -4088,9 +3538,9 @@ HYPRE_Int hypre_ParGenerateHybridScale(hypre_ParCSRMatrix *A, - hypre_CSRMatrix *domain_structure, - hypre_CSRMatrix **A_boundary_pointer, - HYPRE_Real **scale_pointer) + hypre_CSRMatrix *domain_structure, + hypre_CSRMatrix **A_boundary_pointer, + HYPRE_Real **scale_pointer) { hypre_CSRMatrix *A_ext; HYPRE_Int *A_ext_i; @@ -4101,7 +3551,7 @@ HYPRE_Int *A_boundary_i; HYPRE_Int *A_boundary_j; HYPRE_Real *A_boundary_data; - + HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure); HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure); @@ -4121,7 +3571,7 @@ HYPRE_Int jj, j_loc, index, start; HYPRE_Int col_0, col_n; HYPRE_Int *col_map_offd = hypre_ParCSRMatrixColMapOffd(A); - + hypre_ParCSRCommHandle *comm_handle; col_0 = hypre_ParCSRMatrixFirstColDiag(A)-1; @@ -4137,7 +3587,7 @@ } scale = hypre_CTAlloc(HYPRE_Real, num_variables); - if (num_cols_offd) + if (num_cols_offd) { scale_ext = hypre_CTAlloc(HYPRE_Real,num_cols_offd); index_ext = hypre_CTAlloc(HYPRE_Int,num_cols_offd); @@ -4150,25 +3600,25 @@ index_ext[i] = -1; for (i=0; i < num_domains; i++) - { + { for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++) { - j_loc = j_domain_dof[j]; - if (j_loc >= num_variables) - { - j_loc -= num_variables; - if (index_ext[j_loc] == -1) - { - scale_ext[j_loc] += 1.0; - index_ext[j_loc] ++; - } - } + j_loc = j_domain_dof[j]; + if (j_loc >= num_variables) + { + j_loc -= num_variables; + if (index_ext[j_loc] == -1) + { + scale_ext[j_loc] += 1.0; + index_ext[j_loc] ++; + } + } } } if (comm_pkg) { scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]); - comm_handle=hypre_ParCSRCommHandleCreate(2,comm_pkg,scale_ext,scale_int); + comm_handle=hypre_ParCSRCommHandleCreate(2,comm_pkg,scale_ext,scale_int); hypre_ParCSRCommHandleDestroy(comm_handle); comm_handle = NULL; @@ -4177,34 +3627,34 @@ A_boundary_i = hypre_CTAlloc(HYPRE_Int,num_cols_offd+1); A_ext_j = hypre_CSRMatrixJ(A_ext); A_ext_data = hypre_CSRMatrixData(A_ext); - /* compress A_ext to contain only local data and + /* compress A_ext to contain only local data and necessary boundary points*/ index = 0; for (i=0; i < num_cols_offd; i++) { - A_boundary_i[i] = index; + A_boundary_i[i] = index; for (j = A_ext_i[i]; j < A_ext_i[i+1]; j++) { - j_col = A_ext_j[j]; - if (j_col > col_0 && j_col < col_n) - { - A_ext_j[j] = j_col-col_0; - index++; - } - else - { - jj = hypre_BinarySearch(col_map_offd,j_col,num_cols_offd); - if (jj > -1 && (scale_ext[jj] > 0)) - { - A_ext_j[j] = num_variables+jj; - index++; - } - else - { - A_ext_j[j] = -1; - } - } - } + j_col = A_ext_j[j]; + if (j_col > col_0 && j_col < col_n) + { + A_ext_j[j] = j_col-col_0; + index++; + } + else + { + jj = hypre_BinarySearch(col_map_offd,j_col,num_cols_offd); + if (jj > -1 && (scale_ext[jj] > 0)) + { + A_ext_j[j] = num_variables+jj; + index++; + } + else + { + A_ext_j[j] = -1; + } + } + } } A_boundary_i[num_cols_offd] = index; @@ -4220,12 +3670,12 @@ index = 0; for (i=0; i < A_ext_i[num_cols_offd]; i++) { - if (A_ext_j[i] > -1) - { - A_boundary_j[index] = A_ext_j[i]; - A_boundary_data[index] = A_ext_data[i]; - index++; - } + if (A_ext_j[i] > -1) + { + A_boundary_j[index] = A_ext_j[i]; + A_boundary_data[index] = A_ext_data[i]; + index++; + } } A_boundary = hypre_CSRMatrixCreate(num_cols_offd,num_variables, index); hypre_CSRMatrixI(A_boundary) = A_boundary_i; @@ -4239,11 +3689,11 @@ { start = send_map_starts[i]; for (j=start; j < send_map_starts[i+1]; j++) - scale[send_map_elmts[j]] += scale_int[index++]; + scale[send_map_elmts[j]] += scale_int[index++]; } if (comm_pkg) hypre_TFree(scale_int); - if (num_cols_offd) + if (num_cols_offd) { hypre_TFree(scale_ext); hypre_TFree(index_ext); diff -Nru hypre-2.11.2/src/parcsr_mv/_hypre_parcsr_mv.h hypre-2.13.0/src/parcsr_mv/_hypre_parcsr_mv.h --- hypre-2.11.2/src/parcsr_mv/_hypre_parcsr_mv.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_mv/_hypre_parcsr_mv.h 2017-10-20 17:42:22.000000000 +0000 @@ -831,6 +831,9 @@ HYPRE_Int hypre_FillResponseParToCSRMatrix ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size ); hypre_ParCSRMatrix *hypre_ParCSRMatrixCompleteClone ( hypre_ParCSRMatrix *A ); hypre_ParCSRMatrix *hypre_ParCSRMatrixUnion ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B ); +#ifdef HYPRE_USE_GPU +hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a); +#endif /* parcsr_matrix.c */ @@ -869,6 +872,9 @@ HYPRE_Int hypre_ParVectorReadIJ ( MPI_Comm comm , const char *filename , HYPRE_Int *base_j_ptr , hypre_ParVector **vector_ptr ); HYPRE_Int hypre_FillResponseParToVectorAll ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size ); HYPRE_Complex hypre_ParVectorLocalSumElts ( hypre_ParVector *vector ); +#ifdef HYPRE_USE_GPU +hypre_int hypre_ParVectorIsManaged(hypre_ParVector *vector); +#endif #ifdef __cplusplus } diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_communication.c hypre-2.13.0/src/parcsr_mv/par_csr_communication.c --- hypre-2.11.2/src/parcsr_mv/par_csr_communication.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_mv/par_csr_communication.c 2017-10-20 17:42:22.000000000 +0000 @@ -71,7 +71,7 @@ case HYPRE_COMM_PKG_JOB_COMPLEX: if (!send_data) { - send_data = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends)); + send_data = hypre_PinnedTAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends)); } if (!recv_data) { diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_matrix.c hypre-2.13.0/src/parcsr_mv/par_csr_matrix.c --- hypre-2.11.2/src/parcsr_mv/par_csr_matrix.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_mv/par_csr_matrix.c 2017-10-20 17:42:22.000000000 +0000 @@ -2153,3 +2153,11 @@ return C; } +#ifdef HYPRE_USE_GPU +hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a){ + if (hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(a))) + return ((hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a))) && (hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixOffd(a)))); + else + return hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a)); +} +#endif diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_matvec.c hypre-2.13.0/src/parcsr_mv/par_csr_matvec.c --- hypre-2.11.2/src/parcsr_mv/par_csr_matvec.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/parcsr_mv/par_csr_matvec.c 2017-10-20 17:42:22.000000000 +0000 @@ -18,6 +18,12 @@ #include "_hypre_parcsr_mv.h" #include +//#ifdef HYPRE_USE_GPU +//extern "C" +//{ +//void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, HYPRE_Int *send_map, HYPRE_Int begin,HYPRE_Int end,cudaStream_t s); +//} +//#endif /*-------------------------------------------------------------------------- * hypre_ParCSRMatrixMatvec @@ -66,7 +72,7 @@ * these conditions terminates processing, and the ierr flag * is informational only. *--------------------------------------------------------------------*/ - + PUSH_RANGE_PAYLOAD("PAR_CSR_MATVEC",5,x_size); hypre_assert( idxstride>0 ); if (num_cols != x_size) @@ -102,7 +108,7 @@ #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime(); #endif - + PUSH_RANGE("MPI_PACK",3); HYPRE_Int use_persistent_comm = 0; #ifdef HYPRE_USING_PERSISTENT_COMM use_persistent_comm = num_vectors == 1; @@ -115,6 +121,7 @@ if ( use_persistent_comm ) { #ifdef HYPRE_USING_PERSISTENT_COMM + PUSH_RANGE("PERCOMM1",0); persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(1, comm_pkg); HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); @@ -122,6 +129,7 @@ hypre_VectorData(x_tmp) = (HYPRE_Complex *)persistent_comm_handle->recv_data; hypre_SeqVectorSetDataOwner(x_tmp, 0); + POP_RANGE; #endif } else @@ -144,6 +152,20 @@ { HYPRE_Int begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0); HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends); +#ifdef HYPRE_USE_GPU + PUSH_RANGE("PERCOMM2DEVICE",4); +#ifdef HYPRE_USING_PERSISTENT_COMM + PackOnDevice((HYPRE_Complex*)persistent_comm_handle->send_data,x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4)); + //PrintPointerAttributes(persistent_comm_handle->send_data); +#else + PackOnDevice((HYPRE_Complex*)x_buf_data[0],x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4)); +#endif + POP_RANGE; + SetAsyncMode(1); + hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0); + SetAsyncMode(0); + //gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(7))); +#else #ifdef HYPRE_USING_OPENMP #pragma omp parallel for HYPRE_SMP_SCHEDULE #endif @@ -156,6 +178,7 @@ #endif = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)]; } +#endif } else for ( jv=0; jvon_device=0; +#endif return matrix; } /*-------------------------------------------------------------------------- @@ -72,7 +76,7 @@ hypre_CSRMatrixData(matrix) = NULL; hypre_CSRMatrixJ(matrix) = NULL; } - hypre_TFree(matrix); + hypre_HostTFree(matrix); matrix = NULL; } @@ -670,3 +674,36 @@ { return hypre_CSRMatrixGetLoadBalancedPartitionBoundary(A, hypre_GetThreadNum() + 1); } +#ifdef HYPRE_USE_GPU +void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A){ + if (hypre_CSRMatrixNumNonzeros(A)==0) return; + + PUSH_RANGE_PAYLOAD("hypre_CSRMatrixPrefetchToDevice",0,hypre_CSRMatrixNumNonzeros(A)); + if ((!A->on_device)&&(hypre_CSRMatrixNumNonzeros(A)>8192)){ + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4))); + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(5))); + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(6))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(5))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(6))); + A->on_device=1; + } + POP_RANGE; +} +void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A){ + PUSH_RANGE("hypre_CSRMatrixPrefetchToDevice",0); + if (A->on_device){ + A->on_device=0; + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4))); + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4))); + gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + } + POP_RANGE; +} +hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a){ + return ((pointerIsManaged((void*)hypre_CSRMatrixData(a))) + && (pointerIsManaged((void*)hypre_CSRMatrixI(a))) + && (pointerIsManaged((void*)hypre_CSRMatrixJ(a)))); +} +#endif diff -Nru hypre-2.11.2/src/seq_mv/csr_matvec.c hypre-2.13.0/src/seq_mv/csr_matvec.c --- hypre-2.11.2/src/seq_mv/csr_matvec.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/csr_matvec.c 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,7 @@ #include "seq_mv.h" #include + /*-------------------------------------------------------------------------- * hypre_CSRMatrixMatvec *--------------------------------------------------------------------------*/ @@ -36,7 +37,15 @@ #ifdef HYPRE_PROFILE HYPRE_Real time_begin = hypre_MPI_Wtime(); #endif - +#ifdef HYPRE_USE_GPU + PUSH_RANGE_PAYLOAD("MATVEC",0, hypre_CSRMatrixNumRows(A)); + HYPRE_Int ret=hypre_CSRMatrixMatvecDevice( alpha,A,x,beta,b,y,offset); + POP_RANGE; + return ret; +#ifdef HYPRE_PROFILE + hypre_profile_times[HYPRE_TIMER_ID_MATVEC] += hypre_MPI_Wtime() - time_begin; +#endif +#endif HYPRE_Complex *A_data = hypre_CSRMatrixData(A); HYPRE_Int *A_i = hypre_CSRMatrixI(A) + offset; HYPRE_Int *A_j = hypre_CSRMatrixJ(A); @@ -765,3 +774,77 @@ return ierr; } +#ifdef HYPRE_USE_GPU +HYPRE_Int +hypre_CSRMatrixMatvecDevice( HYPRE_Complex alpha, + hypre_CSRMatrix *A, + hypre_Vector *x, + HYPRE_Complex beta, + hypre_Vector *b, + hypre_Vector *y, + HYPRE_Int offset ) +{ + + static cusparseHandle_t handle; + static cusparseMatDescr_t descr; + static HYPRE_Int FirstCall=1; + cusparseStatus_t status; + static cudaStream_t s[10]; + static HYPRE_Int myid; + + if (b!=y){ + + PUSH_RANGE_PAYLOAD("MEMCPY",1,y->size-offset); + VecCopy(y->data,b->data,(y->size-offset),HYPRE_STREAM(4)); + POP_RANGE + } + + if (x==y) fprintf(stderr,"ERROR::x and y are the same pointer in hypre_CSRMatrixMatvecDevice\n"); + + if (FirstCall){ + PUSH_RANGE("FIRST_CALL",4); + + handle=getCusparseHandle(); + + status= cusparseCreateMatDescr(&descr); + if (status != CUSPARSE_STATUS_SUCCESS) { + printf("ERROR:: Matrix descriptor initialization failed\n"); + exit(2); + } + + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); + + FirstCall=0; + hypre_int jj; + for(jj=0;jj<5;jj++) + s[jj]=HYPRE_STREAM(jj); + nvtxNameCudaStreamA(s[4], "HYPRE_COMPUTE_STREAM"); + hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + myid++; + POP_RANGE; + } + + PUSH_RANGE("PREFETCH+SPMV",2); + + hypre_CSRMatrixPrefetchToDevice(A); + hypre_SeqVectorPrefetchToDevice(x); + hypre_SeqVectorPrefetchToDevice(y); + + if (offset!=0) printf("WARNING:: Offset is not zero in hypre_CSRMatrixMatvecDevice :: %d \n",offset); + cusparseErrchk(cusparseDcsrmv(handle , + CUSPARSE_OPERATION_NON_TRANSPOSE, + A->num_rows-offset, A->num_cols, A->num_nonzeros, + &alpha, descr, + A->data ,A->i+offset,A->j, + x->data, &beta, y->data+offset)); + + if (!GetAsyncMode()){ + gpuErrchk(cudaStreamSynchronize(s[4])); + } + POP_RANGE; + + return 0; + +} +#endif diff -Nru hypre-2.11.2/src/seq_mv/gpukernels.cu hypre-2.13.0/src/seq_mv/gpukernels.cu --- hypre-2.11.2/src/seq_mv/gpukernels.cu 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/gpukernels.cu 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,243 @@ +#include +#include +//#include +#include "_hypre_utilities.h" +#define gpuErrchk2(ans) { gpuAssert2((ans), __FILE__, __LINE__); } +inline void gpuAssert2(cudaError_t code, const char *file, hypre_int line) +{ + if (code != cudaSuccess) + { + printf("GPUassert2: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(2); + } +} + + + +extern "C"{ + __global__ + void VecScaleKernelText(HYPRE_Complex *__restrict__ u, const HYPRE_Complex *__restrict__ v, const HYPRE_Complex *__restrict__ l1_norm, hypre_int num_rows){ + hypre_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>>(u,v,l1_norm,num_rows); +#ifdef CATCH_LAUNCH_ERRORS + gpuErrchk2(cudaPeekAtLastError()); + gpuErrchk2(cudaDeviceSynchronize()); +#endif + gpuErrchk2(cudaStreamSynchronize(s)); + POP_RANGE; + } +} + + +extern "C"{ + + __global__ + void VecCopyKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex* __restrict__ src, hypre_int size){ + hypre_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>>(tgt,src,size); + //gpuErrchk2(cudaStreamSynchronize(s)); + POP_RANGE; + } +} +extern "C"{ + + __global__ + void VecSetKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex value,hypre_int size){ + hypre_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>>(tgt,value,size); + cudaStreamSynchronize(s); + //cudaDeviceSynchronize(); + } +} +extern "C"{ + __global__ + void PackOnDeviceKernel(HYPRE_Complex* __restrict__ send_data,const HYPRE_Complex* __restrict__ x_local_data, const hypre_int* __restrict__ send_map, hypre_int begin,hypre_int end){ + hypre_int i = begin+blockIdx.x * blockDim.x + threadIdx.x; + if (i>>(send_data,x_local_data,send_map,begin,end); +#ifdef CATCH_LAUNCH_ERRORS + gpuErrchk2(cudaPeekAtLastError()); + gpuErrchk2(cudaDeviceSynchronize()); +#endif + PUSH_RANGE("PACK_PREFETCH",1); +#ifndef HYPRE_GPU_USE_PINNED + MemPrefetchSized((void*)send_data,(end-begin)*sizeof(HYPRE_Complex),cudaCpuDeviceId,s); +#endif + POP_RANGE; + //gpuErrchk2(cudaStreamSynchronize(s)); + } +} + + // Scale vector by scalar + +extern "C"{ +__global__ +void VecScaleScalarKernel(HYPRE_Complex *__restrict__ u, const HYPRE_Complex alpha ,hypre_int num_rows){ + hypre_int i = blockIdx.x * blockDim.x + threadIdx.x; + //if (i<5) printf("DEVICE %d %lf %lf %lf\n",i,u[i],v[i],l1_norm[i]); + if (i>>(u,alpha,num_rows); +#ifdef CATCH_LAUNCH_ERRORS + gpuErrchk2(cudaPeekAtLastError()); + gpuErrchk2(cudaDeviceSynchronize()); +#endif + gpuErrchk2(cudaStreamSynchronize(s)); + POP_RANGE; + return 0; + } +} + + +extern "C"{ +__global__ +void SpMVCudaKernel(HYPRE_Complex* __restrict__ y,HYPRE_Complex alpha, const HYPRE_Complex* __restrict__ A_data, const hypre_int* __restrict__ A_i, const hypre_int* __restrict__ A_j, const HYPRE_Complex* __restrict__ x, HYPRE_Complex beta, hypre_int num_rows) +{ + hypre_int i= blockIdx.x * blockDim.x + threadIdx.x; + if (i>>(y,alpha,A_data,A_i,A_j,x,num_rows); + else + SpMVCudaKernel<<>>(y,alpha,A_data,A_i,A_j,x,beta,num_rows); +#ifdef CATCH_LAUNCH_ERRORS + gpuErrchk2(cudaPeekAtLastError()); + gpuErrchk2(cudaDeviceSynchronize()); +#endif + +} +} +extern "C"{ + __global__ + void CompileFlagSafetyCheck(hypre_int actual){ +#ifdef __CUDA_ARCH__ + hypre_int cudarch=__CUDA_ARCH__; + if (cudarch!=actual){ + printf("WARNING :: nvcc -arch flag does not match actual device architecture\nWARNING :: The code can fail silently and produce wrong results\n"); + printf("Arch specified at compile = sm_%d Actual device = sm_%d\n",cudarch/10,actual/10); + } +#else + printf("ERROR:: CUDA_ ARCH is not defined \n This should not be happening\n"); +#endif + } +} +extern "C"{ + void CudaCompileFlagCheck(){ + hypre_int devCount; + cudaGetDeviceCount(&devCount); + hypre_int i; + hypre_int cudarch_actual; + for(i = 0; i < devCount; ++i) + { + struct cudaDeviceProp props; + cudaGetDeviceProperties(&props, i); + cudarch_actual=props.major*100+props.minor*10; + } + gpuErrchk2(cudaPeekAtLastError()); + gpuErrchk2(cudaDeviceSynchronize()); + CompileFlagSafetyCheck<<<1,1,0,0>>>(cudarch_actual); + cudaError_t code=cudaPeekAtLastError(); + if (code != cudaSuccess) + { + fprintf(stderr,"ERROR in CudaCompileFlagCheck%s \n", cudaGetErrorString(code)); + fprintf(stderr,"ERROR :: Check if compile arch flags match actual device arch = sm_%d\n",cudarch_actual/10); + exit(2); + } + gpuErrchk2(cudaDeviceSynchronize()); + } +} diff -Nru hypre-2.11.2/src/seq_mv/gpukernels.h hypre-2.13.0/src/seq_mv/gpukernels.h --- hypre-2.11.2/src/seq_mv/gpukernels.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/gpukernels.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,9 @@ +#ifdef HYPRE_USE_GPU +#include +int VecScaleScalar(double *u, const double alpha, int num_rows,cudaStream_t s); +void VecCopy(double* tgt, const double* src, int size,cudaStream_t s); +void VecSet(double* tgt, int size, double value, cudaStream_t s); +void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void CudaCompileFlagCheck(); +#endif diff -Nru hypre-2.11.2/src/seq_mv/headers hypre-2.13.0/src/seq_mv/headers --- hypre-2.11.2/src/seq_mv/headers 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/headers 2017-10-20 17:42:22.000000000 +0000 @@ -49,6 +49,7 @@ cat mapped_matrix.h >> $INTERNAL_HEADER cat multiblock_matrix.h >> $INTERNAL_HEADER cat vector.h >> $INTERNAL_HEADER +cat gpukernel.h >> $INTERNAL_HEADER ../utilities/protos *.c >> $INTERNAL_HEADER diff -Nru hypre-2.11.2/src/seq_mv/Makefile hypre-2.13.0/src/seq_mv/Makefile --- hypre-2.11.2/src/seq_mv/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -13,6 +13,9 @@ include ../config/Makefile.config +FILES_NVCC = +include $(HYPRE_NVCC_MAKEFILE) + CINCLUDES = ${INCLUDES} ${MPIINCLUDE} C_COMPILE_FLAGS =\ @@ -44,6 +47,7 @@ vector.c OBJS = ${FILES:.c=.o} +CUOBJS = ${FILES_NVCC:.cu=.o} SONAME = libHYPRE_seq_mv-${HYPRE_RELEASE_VERSION}.so @@ -71,12 +75,12 @@ # Rules ################################################################## -libHYPRE_seq_mv.a: ${OBJS} +libHYPRE_seq_mv.a: ${OBJS} ${CUOBJS} @echo "Building $@ ... " ${AR} $@ ${OBJS} ${RANLIB} $@ -libHYPRE_seq_mv.so: ${OBJS} +libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS} @echo "Building $@ ... " ${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME} ln -s ${SONAME} $@ diff -Nru hypre-2.11.2/src/seq_mv/Makefile.empty hypre-2.13.0/src/seq_mv/Makefile.empty --- hypre-2.11.2/src/seq_mv/Makefile.empty 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/Makefile.empty 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1 @@ + diff -Nru hypre-2.11.2/src/seq_mv/Makefile.nvcc hypre-2.13.0/src/seq_mv/Makefile.nvcc --- hypre-2.11.2/src/seq_mv/Makefile.nvcc 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/Makefile.nvcc 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,4 @@ + +FILES_NVCC =\ + gpukernels.cu + diff -Nru hypre-2.11.2/src/seq_mv/seq_mv.h hypre-2.13.0/src/seq_mv/seq_mv.h --- hypre-2.11.2/src/seq_mv/seq_mv.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/seq_mv.h 2017-10-20 17:42:22.000000000 +0000 @@ -59,6 +59,11 @@ HYPRE_Int *rownnz; HYPRE_Int num_rownnz; +#ifdef HYPRE_USE_GPU + /* Flag to keeping track of prefetching */ + HYPRE_Int on_device; +#endif + } hypre_CSRMatrix; /*-------------------------------------------------------------------------- @@ -208,6 +213,9 @@ With rowwise storage, vj[i] = data[ j + num_vectors*i] */ HYPRE_Int vecstride, idxstride; /* ... so vj[i] = data[ j*vecstride + i*idxstride ] regardless of row_storage.*/ +#ifdef HYPRE_USE_GPU + HYPRE_Int on_device; +#endif } hypre_Vector; @@ -225,6 +233,20 @@ #endif +#ifndef hypre_GPUKERNELS_HEADER +#define hypre_GPUKERNELS_HEADER +#ifdef HYPRE_USE_GPU +#include +int VecScaleScalar(double *u, const double alpha, int num_rows,cudaStream_t s); +void VecCopy(double* tgt, const double* src, int size,cudaStream_t s); +void VecSet(double* tgt, int size, double value, cudaStream_t s); +void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void CudaCompileFlagCheck(); +void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, hypre_int *send_map, hypre_int begin,hypre_int end,cudaStream_t s); +#endif +#endif + /* csr_matop.c */ hypre_CSRMatrix *hypre_CSRMatrixAdd ( hypre_CSRMatrix *A , hypre_CSRMatrix *B ); hypre_CSRMatrix *hypre_CSRMatrixMultiply ( hypre_CSRMatrix *A , hypre_CSRMatrix *B ); @@ -245,6 +267,11 @@ HYPRE_Int hypre_CSRMatrixCopy ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int copy_data ); hypre_CSRMatrix *hypre_CSRMatrixClone ( hypre_CSRMatrix *A ); hypre_CSRMatrix *hypre_CSRMatrixUnion ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int *col_map_offd_A , HYPRE_Int *col_map_offd_B , HYPRE_Int **col_map_offd_C ); +#ifdef HYPRE_USE_GPU +void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A); +void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A); +hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a); +#endif /* csr_matvec.c */ // y[offset:end] = alpha*A[offset:end,:]*x + beta*b[offset:end] @@ -253,7 +280,9 @@ HYPRE_Int hypre_CSRMatrixMatvec ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y ); HYPRE_Int hypre_CSRMatrixMatvecT ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y ); HYPRE_Int hypre_CSRMatrixMatvec_FF ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y , HYPRE_Int *CF_marker_x , HYPRE_Int *CF_marker_y , HYPRE_Int fpt ); - +#ifdef HYPRE_USE_GPU +HYPRE_Int hypre_CSRMatrixMatvecDevice( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *b, hypre_Vector *y, HYPRE_Int offset ); +#endif /* genpart.c */ HYPRE_Int hypre_GeneratePartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int **part_ptr ); HYPRE_Int hypre_GenerateLocalPartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int myid , HYPRE_Int **part_ptr ); @@ -337,7 +366,16 @@ HYPRE_Int hypre_SeqVectorAxpy ( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y ); HYPRE_Real hypre_SeqVectorInnerProd ( hypre_Vector *x , hypre_Vector *y ); HYPRE_Complex hypre_VectorSumElts ( hypre_Vector *vector ); - +#ifdef HYPRE_USE_GPU +HYPRE_Complex hypre_VectorSumAbsElts ( hypre_Vector *vector ); +HYPRE_Int hypre_SeqVectorCopyDevice ( hypre_Vector *x , hypre_Vector *y ); +HYPRE_Int hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y ); +HYPRE_Real hypre_SeqVectorInnerProdDevice ( hypre_Vector *x , hypre_Vector *y ); +void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x); +void hypre_SeqVectorPrefetchToHost(hypre_Vector *x); +void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index); +hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x); +#endif #ifdef __cplusplus } #endif diff -Nru hypre-2.11.2/src/seq_mv/vector.c hypre-2.13.0/src/seq_mv/vector.c --- hypre-2.11.2/src/seq_mv/vector.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/seq_mv/vector.c 2017-10-20 17:42:22.000000000 +0000 @@ -18,6 +18,11 @@ #include "seq_mv.h" #include +#ifdef HYPRE_USE_GPU +#include +#include +#include "gpukernels.h" +#endif /*-------------------------------------------------------------------------- * hypre_SeqVectorCreate @@ -28,7 +33,11 @@ { hypre_Vector *vector; - vector = hypre_CTAlloc(hypre_Vector, 1); + vector = hypre_HostCTAlloc(hypre_Vector, 1); + +#ifdef HYPRE_USE_GPU + vector->on_device=0; +#endif hypre_VectorData(vector) = NULL; hypre_VectorSize(vector) = size; @@ -69,7 +78,7 @@ { hypre_TFree(hypre_VectorData(vector)); } - hypre_TFree(vector); + hypre_HostTFree(vector); } return ierr; @@ -246,6 +255,10 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, HYPRE_Complex value ) { +#ifdef HYPRE_USE_GPU + VecSet(hypre_VectorData(v),hypre_VectorSize(v),value,HYPRE_STREAM(4)); + return 0; +#endif #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif @@ -310,6 +323,9 @@ hypre_SeqVectorCopy( hypre_Vector *x, hypre_Vector *y ) { +#ifdef HYPRE_USE_GPU + return hypre_SeqVectorCopyDevice(x,y); +#endif #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif @@ -394,7 +410,10 @@ #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif - + +#ifdef HYPRE_USE_GPU + return VecScaleScalar(y->data,alpha, hypre_VectorSize(y),HYPRE_STREAM(4)); +#endif HYPRE_Complex *y_data = hypre_VectorData(y); HYPRE_Int size = hypre_VectorSize(y); @@ -426,6 +445,9 @@ hypre_Vector *x, hypre_Vector *y ) { +#ifdef HYPRE_USE_GPU + return hypre_SeqVectorAxpyDevice(alpha,x,y); +#endif #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif @@ -460,6 +482,9 @@ HYPRE_Real hypre_SeqVectorInnerProd( hypre_Vector *x, hypre_Vector *y ) { +#ifdef HYPRE_USE_GPU + return hypre_SeqVectorInnerProdDevice(x,y); +#endif #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime(); #endif @@ -506,3 +531,133 @@ return sum; } + +#ifdef HYPRE_USE_GPU +/* Sums of the absolute value of the elements for comparison to cublas device side routine */ +HYPRE_Complex hypre_VectorSumAbsElts( hypre_Vector *vector ) +{ + HYPRE_Complex sum = 0; + HYPRE_Complex *data = hypre_VectorData( vector ); + HYPRE_Int size = hypre_VectorSize( vector ); + HYPRE_Int i; + +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(i) reduction(+:sum) HYPRE_SMP_SCHEDULE +#endif + for ( i=0; i size_y) size = size_y; + size *=hypre_VectorNumVectors(x); + PUSH_RANGE_PAYLOAD("VECCOPYDEVICE",2,size); + hypre_SeqVectorPrefetchToDevice(x); + hypre_SeqVectorPrefetchToDevice(y); + VecCopy(y_data,x_data,size,HYPRE_STREAM(4)); + cudaStreamSynchronize(HYPRE_STREAM(4)); + POP_RANGE; + return ierr; +} +HYPRE_Int +hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha, + hypre_Vector *x, + hypre_Vector *y ){ + + HYPRE_Complex *x_data = hypre_VectorData(x); + HYPRE_Complex *y_data = hypre_VectorData(y); + HYPRE_Int size = hypre_VectorSize(x); + + HYPRE_Int i; + + HYPRE_Int ierr = 0; + cublasStatus_t stat; + size *=hypre_VectorNumVectors(x); + + PUSH_RANGE_PAYLOAD("DEVAXPY",0,hypre_VectorSize(x)); + hypre_SeqVectorPrefetchToDevice(x); + hypre_SeqVectorPrefetchToDevice(y); + static cublasHandle_t handle; + static HYPRE_Int firstcall=1; + if (firstcall){ + handle=getCublasHandle(); + firstcall=0; + } + cublasErrchk(cublasDaxpy(handle,(HYPRE_Int)size,&alpha,x_data,1,y_data,1)); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + POP_RANGE; + return ierr; +} + +HYPRE_Real hypre_SeqVectorInnerProdDevice( hypre_Vector *x, + hypre_Vector *y ) +{ + PUSH_RANGE_PAYLOAD("DEVDOT",4,hypre_VectorSize(x)); + static cublasHandle_t handle; + static HYPRE_Int firstcall=1; + + HYPRE_Complex *x_data = hypre_VectorData(x); + HYPRE_Complex *y_data = hypre_VectorData(y); + HYPRE_Int size = hypre_VectorSize(x); + + HYPRE_Int i; + + HYPRE_Real result = 0.0; + cublasStatus_t stat; + if (firstcall){ + handle = getCublasHandle(); + firstcall=0; + } + PUSH_RANGE_PAYLOAD("DEVDOT-PRFETCH",5,hypre_VectorSize(x)); + //hypre_SeqVectorPrefetchToDevice(x); + //hypre_SeqVectorPrefetchToDevice(y); + POP_RANGE; + PUSH_RANGE_PAYLOAD("DEVDOT-ACTUAL",0,hypre_VectorSize(x)); + stat=cublasDdot(handle, (HYPRE_Int)size, + x_data, 1, + y_data, 1, + &result); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + POP_RANGE; + POP_RANGE; + return result; + +} +void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x){ + if (hypre_VectorSize(x)==0) return; + PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0); + gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + POP_RANGE; +} +void hypre_SeqVectorPrefetchToHost(hypre_Vector *x){ + if (hypre_VectorSize(x)==0) return; + PUSH_RANGE("hypre_SeqVectorPrefetchToHost",0); + gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4))); + POP_RANGE; +} +void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index){ + if (hypre_VectorSize(x)==0) return; + PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0); + gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(index))); + gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(index))); + POP_RANGE; +} +hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x){ + return pointerIsManaged((void*)hypre_VectorData(x)); +} +#endif diff -Nru hypre-2.11.2/src/sstruct_ls/fac_amr_fcoarsen.c hypre-2.13.0/src/sstruct_ls/fac_amr_fcoarsen.c --- hypre-2.11.2/src/sstruct_ls/fac_amr_fcoarsen.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_amr_fcoarsen.c 2017-10-20 17:42:22.000000000 +0000 @@ -53,13 +53,13 @@ } -#define AbsStencilShape(stencil, abs_shape) \ - { \ - HYPRE_Int ii,jj,kk; \ - ii = hypre_IndexX(stencil); \ - jj = hypre_IndexY(stencil); \ - kk = hypre_IndexZ(stencil); \ - abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ +#define AbsStencilShape(stencil, abs_shape) \ + { \ + HYPRE_Int ii,jj,kk; \ + ii = hypre_IndexX(stencil); \ + jj = hypre_IndexY(stencil); \ + kk = hypre_IndexZ(stencil); \ + abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ } /*-------------------------------------------------------------------------- @@ -157,7 +157,7 @@ HYPRE_Int i, j, k, l, m, n, ll, kk, jj; HYPRE_Int nvars, var1, var2, var2_start; - HYPRE_Int iA, iAc, iA_shift_z, iA_shift_zy, iA_shift_zyx; + HYPRE_Int iA_shift_z, iA_shift_zy, iA_shift_zyx; hypre_Index lindex; hypre_Index index1, index2; @@ -1868,19 +1868,17 @@ /*---------------------------------------------------------------- * Loop over interior grid box. *----------------------------------------------------------------*/ + hypre_BoxGetSize(&fine_box, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, - A_dbox, fstart, stridef, iA, - crse_dbox, cstart, stridec, iAc); + hypre_SerialBoxLoop2Begin(ndim, loop_size, + A_dbox, fstart, stridef, iA, + crse_dbox, cstart, stridec, iAc); #if 0 #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,sum,vals) HYPRE_SMP_SCHEDULE #endif -#else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop2For(iA, iAc) { for (i= 0; i< stencil_size; i++) { @@ -1986,7 +1984,7 @@ } crse_ptrs[ rank_stencils[0] ][iAc]= sum; } - hypre_BoxLoop2End(iA, iAc); + hypre_SerialBoxLoop2End(iA, iAc); } /* end hypre_ForBoxI(fi, fbox_interior_ci) */ /*------------------------------------------------------------------ @@ -2050,19 +2048,12 @@ /*-------------------------------------------------------------- * Loop over boundary grid box. *--------------------------------------------------------------*/ + hypre_BoxGetSize(&fine_box, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, - A_dbox, fstart, stridef, iA, - crse_dbox, cstart, stridec, iAc); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,temp3,ll,kk,jj,temp2,cnt1,index_temp,boxman_entry,found,Uventry,nUentries,ncols,rows,cols,vals2,sum,vals) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop2For(iA, iAc) + hypre_SerialBoxLoop2Begin(ndim, loop_size, + A_dbox, fstart, stridef, iA, + crse_dbox, cstart, stridec, iAc); { hypre_BoxLoopGetIndex(lindex); for (i= 0; i< stencil_size; i++) @@ -2338,7 +2329,7 @@ hypre_TFree(temp3); } - hypre_BoxLoop2End(iA, iAc); + hypre_SerialBoxLoop2End(iA, iAc); } /* hypre_ForBoxI(fi, fbox_bdy_ci_fi) */ } /* hypre_ForBoxArrayI(arrayi, fbox_bdy_ci) */ @@ -2940,7 +2931,7 @@ } hypre_qsort1(interface_stencil_ranks[i], (HYPRE_Real *) temp1, 0, - coarse_stencil_cnt[i]-1); + coarse_stencil_cnt[i]-1); /*--------------------------------------------------------------- * swap the stencil_vals to agree with the rank swapping. @@ -3494,6 +3485,7 @@ #endif hypre_BoxLoop1For(iA) { + HYPRE_Int i; for (i= 0; i< stencil_size; i++) { if (i != centre) diff -Nru hypre-2.11.2/src/sstruct_ls/fac_amr_rap.c hypre-2.13.0/src/sstruct_ls/fac_amr_rap.c --- hypre-2.11.2/src/sstruct_ls/fac_amr_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_amr_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -86,7 +86,6 @@ hypre_Index index, stride, zero_index; HYPRE_Int nvars, var1, var2, part, cbox; HYPRE_Int i, j, k, size; - HYPRE_Int iA, iAc; HYPRE_Int myid; HYPRE_Int ierr= 0; @@ -220,7 +219,7 @@ smatrix_dbox, ilower, stride, iA, fac_smatrix_dbox, ilower, stride, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { @@ -277,7 +276,7 @@ smatrix_dbox, ilower, stride, iA, fac_smatrix_dbox, ilower, stride, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { @@ -415,7 +414,7 @@ smatrix_dbox, ilower, stride, iA, fac_smatrix_dbox, ilower, stride, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { diff -Nru hypre-2.11.2/src/sstruct_ls/fac_cf_coarsen.c hypre-2.13.0/src/sstruct_ls/fac_cf_coarsen.c --- hypre-2.11.2/src/sstruct_ls/fac_cf_coarsen.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_cf_coarsen.c 2017-10-20 17:42:22.000000000 +0000 @@ -48,17 +48,17 @@ jj= -1; \ if (kk==2) \ kk= -1; \ - hypre_SetIndex3(stencil, ii, jj, kk); \ + hypre_SetIndex3(stencil, ii, jj, kk); \ } -#define AbsStencilShape(stencil, abs_shape) \ - { \ - HYPRE_Int ii,jj,kk; \ - ii = hypre_IndexX(stencil); \ - jj = hypre_IndexY(stencil); \ - kk = hypre_IndexZ(stencil); \ - abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ +#define AbsStencilShape(stencil, abs_shape) \ + { \ + HYPRE_Int ii,jj,kk; \ + ii = hypre_IndexX(stencil); \ + jj = hypre_IndexY(stencil); \ + kk = hypre_IndexZ(stencil); \ + abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ } /*-------------------------------------------------------------------------- @@ -130,7 +130,7 @@ HYPRE_Int rank, startrank; HYPRE_Real *vals; - HYPRE_Int i, j, iA; + HYPRE_Int i, j; HYPRE_Int nvars, var1; hypre_Index lindex, zero_index; @@ -219,7 +219,7 @@ hypre_StructMapCoarseToFine(hypre_BoxIMin(cgrid_box), zero_index, refine_factors, hypre_BoxIMin(&refined_box)); hypre_SetIndex3(index1, refine_factors[0]-1, refine_factors[1]-1, - refine_factors[2]-1); + refine_factors[2]-1); hypre_StructMapCoarseToFine(hypre_BoxIMax(cgrid_box), index1, refine_factors, hypre_BoxIMax(&refined_box)); @@ -340,17 +340,9 @@ fgrid_cinterface= hypre_BoxArrayBox(cinterface_array, boxi); hypre_CopyIndex(hypre_BoxIMin(fgrid_cinterface), node_extents); hypre_BoxGetSize(fgrid_cinterface, loop_size); - - hypre_BoxLoop1Begin(ndim, loop_size, - A_dbox, node_extents, stridec, iA); -#if 0 /* Are private static arrays a problem? */ -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,lindex,i,index_temp,boxman_entry,rank,found,Uventry,nUentries,temp1,cnt1,ncols,rows,cols,temp2,vals,index2,index1,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(iA) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + A_dbox, node_extents, stridec, iA); { hypre_BoxLoopGetIndex(lindex); for (i= 0; i< stencil_size; i++) @@ -482,7 +474,7 @@ } /* if (Uventry != NULL) */ } /* if (nUventries > 0) */ } - hypre_BoxLoop1End(iA); + hypre_SerialBoxLoop1End(iA); } /* for (boxi= stencil_size; boxi< box_array_size; boxi++) */ } /* hypre_ForBoxArrayI(fi, cinterface_arrays) */ } /* hypre_ForBoxI(ci, cgrid_boxes) */ diff -Nru hypre-2.11.2/src/sstruct_ls/fac_interp2.c hypre-2.13.0/src/sstruct_ls/fac_interp2.c --- hypre-2.11.2/src/sstruct_ls/fac_interp2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_interp2.c 2017-10-20 17:42:22.000000000 +0000 @@ -739,9 +739,6 @@ hypre_StructVector *e_var; hypre_StructVector *recv_var; - HYPRE_Int xci; - HYPRE_Int ei; - HYPRE_Real ***xcp; HYPRE_Real ***ep; @@ -967,17 +964,9 @@ hypre_CopyIndex(hypre_BoxIMin(ownbox), startc); hypre_BoxGetSize(ownbox, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, - e_dbox, start, stride, ei, - xc_dbox, startc, stridec, xci); -#if 1 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop2For(ei, xci) + hypre_SerialBoxLoop2Begin(ndim, loop_size, + e_dbox, start, stride, ei, + xc_dbox, startc, stridec, xci); { /*-------------------------------------------------------- * Linear interpolation. Determine the weights and the @@ -1191,7 +1180,7 @@ } /* for (j= 0; j< jmax; j++) */ } /* for (k= 0; k< kmax; k++) */ } - hypre_BoxLoop2End(ei, xci); + hypre_SerialBoxLoop2End(ei, xci); }/* hypre_ForBoxI(bi, own_abox) */ } /* hypre_ForBoxArray(fi, fgrid_boxes) */ @@ -1297,7 +1286,7 @@ for (j=0; j< jsize; j++) { hypre_SetIndex3(temp_index2, - ptr_ishift, j+ptr_jshift, k+ptr_kshift); + ptr_ishift, j+ptr_jshift, k+ptr_kshift); xcp[k][j]= hypre_StructVectorBoxData(recv_var, bi) + hypre_BoxOffsetDistance(xc_dbox, temp_index2); } @@ -1306,17 +1295,9 @@ hypre_CopyIndex(hypre_BoxIMin(ownbox), startc); hypre_BoxGetSize(ownbox, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, - e_dbox, start, stride, ei, - xc_dbox, startc, stridec, xci); -#if 1 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop2For(ei, xci) + hypre_SerialBoxLoop2Begin(ndim, loop_size, + e_dbox, start, stride, ei, + xc_dbox, startc, stridec, xci); { /*-------------------------------------------------------- * Linear interpolation. Determine the weights and the @@ -1533,7 +1514,7 @@ } /* for (j= 0; j< jmax; j++) */ } /* for (k= 0; k< kmax; k++) */ } - hypre_BoxLoop2End(ei, xci); + hypre_SerialBoxLoop2End(ei, xci); } /* if (hypre_BoxVolume(ownbox)) */ } /* hypre_ForBoxI(bi, own_abox) */ diff -Nru hypre-2.11.2/src/sstruct_ls/fac_restrict2.c hypre-2.13.0/src/sstruct_ls/fac_restrict2.c --- hypre-2.11.2/src/sstruct_ls/fac_restrict2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_restrict2.c 2017-10-20 17:42:22.000000000 +0000 @@ -38,7 +38,7 @@ ii = (ij%2); \ jj = (ij-ii)/2; \ kk = (rank-2*jj-ii)/4; \ - hypre_SetIndex3(stencil, ii, jj, kk); \ + hypre_SetIndex3(stencil, ii, jj, kk); \ } /*-------------------------------------------------------------------------- @@ -60,8 +60,8 @@ hypre_CommPkg **interlevel_comm; /* hypre_CommPkg **intralevel_comm;*/ /* may need to build an intra comm so - that each processor only fullwts its - own fine data- may need to add contrib */ + that each processor only fullwts its + own fine data- may need to add contrib */ } hypre_FacSemiRestrictData2; @@ -518,9 +518,6 @@ hypre_StructVector *xc_var; hypre_StructVector *xf_var; - HYPRE_Int xci; - HYPRE_Int xfi; - HYPRE_Real ***xfp; HYPRE_Real ***xcp; HYPRE_Real ***xcp_temp; @@ -730,17 +727,9 @@ hypre_BoxGetSize(fgrid_box, temp_index1); hypre_StructMapFineToCoarse(temp_index1, temp_index2, rfactors, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, - xf_dbox, start, stride, xfi, - xc_temp_dbox, startc, stridec, xci); -#if 0 /* Are private static arrays a problem? */ -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xfi,xci,imax,jmax,kmax,k,kcell,j,jcell,i,icell,ijkcell,temp_index2) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop2For(xfi, xci) + hypre_SerialBoxLoop2Begin(ndim, loop_size, + xf_dbox, start, stride, xfi, + xc_temp_dbox, startc, stridec, xci); { /*----------------------------------------------------------------- * Arithmetic average the refinement patch values to get @@ -804,7 +793,7 @@ } } - hypre_BoxLoop2End(xfi, xci); + hypre_SerialBoxLoop2End(xfi, xci); } /* hypre_ForBoxI(fi, fgrid_boxes) */ } /* for (var= 0; var< nvars; var++)*/ diff -Nru hypre-2.11.2/src/sstruct_ls/fac_setup2.c hypre-2.13.0/src/sstruct_ls/fac_setup2.c --- hypre-2.11.2/src/sstruct_ls/fac_setup2.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_setup2.c 2017-10-20 17:42:22.000000000 +0000 @@ -25,7 +25,7 @@ hypre_SStructVector *b, hypre_SStructVector *x ) { - hypre_FACData *fac_data = (hypre_FACData*)fac_vdata; + hypre_FACData *fac_data = (hypre_FACData*)fac_vdata; HYPRE_Int *plevels = (fac_data-> plevels); hypre_Index *rfactors = (fac_data-> prefinements); @@ -105,7 +105,6 @@ HYPRE_Int *stencil_vars; HYPRE_Real *values; HYPRE_Real *A_smatrix_value; - HYPRE_Int iA; HYPRE_Int *nrows; HYPRE_Int **ncols; @@ -126,8 +125,8 @@ HYPRE_Int ierr = 0; /*hypre_SStructMatrix *nested_A; -nested_A= hypre_TAlloc(hypre_SStructMatrix , 1); -nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/ + nested_A= hypre_TAlloc(hypre_SStructMatrix , 1); + nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/ /* generate the composite operator with the computed coarse-grid operators */ hypre_AMR_RAP(A_in, rfactors, &A_rap); @@ -498,7 +497,7 @@ sgrid_box, box_start, stride, k, A_smatrix_dbox, box_start, stride, iA); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(k, iA) { @@ -573,7 +572,7 @@ sgrid_box, box_start, stride, k, A_smatrix_dbox, box_start, stride, iA); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(k, iA) { diff -Nru hypre-2.11.2/src/sstruct_ls/fac_zero_stencilcoef.c hypre-2.13.0/src/sstruct_ls/fac_zero_stencilcoef.c --- hypre-2.11.2/src/sstruct_ls/fac_zero_stencilcoef.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/fac_zero_stencilcoef.c 2017-10-20 17:42:22.000000000 +0000 @@ -13,13 +13,13 @@ #include "_hypre_sstruct_ls.h" #include "fac.h" -#define AbsStencilShape(stencil, abs_shape) \ - { \ - HYPRE_Int ii,jj,kk; \ - ii = hypre_IndexX(stencil); \ - jj = hypre_IndexY(stencil); \ - kk = hypre_IndexZ(stencil); \ - abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ +#define AbsStencilShape(stencil, abs_shape) \ + { \ + HYPRE_Int ii,jj,kk; \ + ii = hypre_IndexX(stencil); \ + jj = hypre_IndexY(stencil); \ + kk = hypre_IndexZ(stencil); \ + abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \ } /*-------------------------------------------------------------------------- @@ -71,7 +71,6 @@ HYPRE_Real *ac_ptr; hypre_Index loop_size; - HYPRE_Int iac; HYPRE_Int ci, i, j; HYPRE_Int abs_shape; @@ -176,7 +175,7 @@ ac_dbox, hypre_BoxIMin(shift_ibox), stride, iac); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iac) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iac) { @@ -258,7 +257,6 @@ HYPRE_Real *a_ptr; hypre_Index loop_size; - HYPRE_Int ia; HYPRE_Int fi, fj, i, j; HYPRE_Int abs_shape; HYPRE_Int myid, proc; @@ -402,7 +400,7 @@ a_dbox, hypre_BoxIMin(&intersect_box), stride, ia); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ia) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(ia) { diff -Nru hypre-2.11.2/src/sstruct_ls/HYPRE_sstruct_int.c hypre-2.13.0/src/sstruct_ls/HYPRE_sstruct_int.c --- hypre-2.11.2/src/sstruct_ls/HYPRE_sstruct_int.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/HYPRE_sstruct_int.c 2017-10-20 17:42:22.000000000 +0000 @@ -27,12 +27,12 @@ hypre_StructVector *svector; HYPRE_Int var; - srand( seed ); + hypre_SeedRand( seed ); for (var = 0; var < nvars; var++) { svector = hypre_SStructPVectorSVector(pvector, var); - seed = rand(); + seed = hypre_RandI(); hypre_StructVectorSetRandomValues(svector, seed); } @@ -47,12 +47,12 @@ hypre_SStructPVector *pvector; HYPRE_Int part; - srand( seed ); + hypre_SeedRand( seed ); for (part = 0; part < nparts; part++) { pvector = hypre_SStructVectorPVector(vector, part); - seed = rand(); + seed = hypre_RandI(); hypre_SStructPVectorSetRandomValues(pvector, seed); } diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_grad.c hypre-2.13.0/src/sstruct_ls/maxwell_grad.c --- hypre-2.11.2/src/sstruct_ls/maxwell_grad.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/maxwell_grad.c 2017-10-20 17:42:22.000000000 +0000 @@ -326,15 +326,13 @@ hypre_BoxGetSize(box_piece, loop_size); hypre_CopyIndex(hypre_BoxIMin(box_piece), start); - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 /* Are private static arrays a problem? */ #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -344,7 +342,7 @@ &rank, matrix_type); nflag[rank-start_rank1]= 0; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* if (hypre_BoxVolume(box_piece) < i) */ } /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array1); m++) */ @@ -435,15 +433,13 @@ hypre_BoxGetSize(box_piece, loop_size); hypre_CopyIndex(hypre_BoxIMin(box_piece), start); - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 /* Are private static arrays a problem? */ #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -453,7 +449,7 @@ &rank, matrix_type); eflag[rank-start_rank2]= 0; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* if (hypre_BoxVolume(box_piece) < i) */ } /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array1); k++) */ @@ -467,15 +463,13 @@ hypre_BoxGetSize(box_piece, loop_size); hypre_CopyIndex(hypre_BoxIMin(box_piece), start); - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 /* Are private static arrays a problem? */ #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -485,7 +479,7 @@ &rank, matrix_type); eflag[rank-start_rank2]= 0; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* if (hypre_BoxVolume(box_piece) < i) */ } /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array2); k++) */ hypre_BoxArrayDestroy(tmp_box_array2); @@ -603,15 +597,13 @@ /* Interior box- loop over each edge and find the row rank and then the column ranks for the connected nodes. Change the appropriate values to 1. */ - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -641,7 +633,7 @@ ncols[nrows]= 2; nrows++; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); /* now the boundary layers. To cases to consider: is the edge totally on the boundary or is the edge connected @@ -672,15 +664,13 @@ hypre_BoxGetSize(&layer, loop_size); hypre_CopyIndex(hypre_BoxIMin(&layer), start); - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -755,7 +745,7 @@ } /* if (eflag[m-start_rank2]) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* for (ndirection= 0; ndirection< 2; ndirection++) */ } /* for (d= 0; d< ndim; d++) */ diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_physbdy.c hypre-2.13.0/src/sstruct_ls/maxwell_physbdy.c --- hypre-2.11.2/src/sstruct_ls/maxwell_physbdy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/maxwell_physbdy.c 2017-10-20 17:42:22.000000000 +0000 @@ -476,15 +476,7 @@ hypre_BoxGetSize(box, loop_size); hypre_CopyIndex(hypre_BoxIMin(box), start); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,boxman_entry,cnt) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -497,7 +489,7 @@ cnt++; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(p, box_array) */ } /* hypre_ForBoxArrayI(m, fbdry) */ diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_PNedelec.c hypre-2.13.0/src/sstruct_ls/maxwell_PNedelec.c --- hypre-2.11.2/src/sstruct_ls/maxwell_PNedelec.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/maxwell_PNedelec.c 2017-10-20 17:42:22.000000000 +0000 @@ -69,7 +69,7 @@ HYPRE_Int nvars, Edge_nvars, part, var; HYPRE_Int tot_vars= 8; - HYPRE_Int t, i, j, k, l, m, n, p, r, size; + HYPRE_Int t, i, j, k, l, m, n, p, size; HYPRE_Int ilower, iupper; HYPRE_Int jlower, jupper; @@ -538,16 +538,8 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, stride, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,p,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, stride, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -568,7 +560,7 @@ j++; } } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI */ @@ -663,16 +655,8 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -711,7 +695,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyBox(cellbox, ©_box); @@ -744,16 +728,9 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -788,7 +765,7 @@ } /* for (n= 1; n< rfactor[2]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -847,16 +824,9 @@ /* increase the loop_size by one in the Z_Face direction to cover upper boundary Z_Faces. */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -893,7 +863,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* X_Face */ hypre_CopyBox(cellbox, ©_box); @@ -927,16 +897,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -973,7 +936,7 @@ } /* for (n= 1; n< rfactor[2]; n++) */ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -1031,16 +994,9 @@ /* increase the loop_size by one in the X_Face direction */ loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1076,7 +1032,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyBox(cellbox, ©_box); @@ -1109,16 +1065,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1154,7 +1103,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -1204,16 +1153,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1245,7 +1186,7 @@ } /* for (p= 1; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -1268,16 +1209,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1311,7 +1244,7 @@ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -1334,16 +1267,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1386,7 +1311,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -1409,16 +1334,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1461,7 +1378,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -1485,16 +1402,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1536,7 +1445,7 @@ var_index[0]-= (rfactor[0]-1); } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -1727,16 +1636,8 @@ /* note that the correct cbox corresponding to this non-vanishing fbox is used. */ - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, stride, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,p,cindex,l,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, stride, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1774,8 +1675,7 @@ k++; } /* if ((p <= upper_ranks[part][t]) && (p >= lower_ranks[part][t])) */ } - - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI */ hypre_TFree(boxoffset); @@ -1871,16 +1771,9 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -1943,7 +1836,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y plane direction */ hypre_CopyIndex(Edge_cstarts[part][i], cstart); @@ -1980,16 +1873,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2048,7 +1934,7 @@ } /* for (n= 1; n< rfactor[2]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2113,16 +1999,9 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2183,7 +2062,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* X_Face */ hypre_CopyBox(cellbox, ©_box); @@ -2221,16 +2100,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2289,7 +2161,7 @@ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2350,16 +2222,9 @@ /* increase the loop_size by one in the X plane direction */ loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2420,7 +2285,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y plane */ hypre_CopyBox(cellbox, ©_box); @@ -2457,16 +2322,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2525,7 +2383,7 @@ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2576,16 +2434,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2627,7 +2477,7 @@ } /* for (p= 1; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2652,16 +2502,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2703,7 +2545,7 @@ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2728,16 +2570,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2806,7 +2640,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2831,16 +2665,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2909,7 +2735,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -2934,16 +2760,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3011,7 +2829,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; } diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_semi_interp.c hypre-2.13.0/src/sstruct_ls/maxwell_semi_interp.c --- hypre-2.11.2/src/sstruct_ls/maxwell_semi_interp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/maxwell_semi_interp.c 2017-10-20 17:42:22.000000000 +0000 @@ -48,7 +48,7 @@ HYPRE_Int hypre_DestroyPTopology(void *PTopology_vdata) { - hypre_PTopology *PTopology= (hypre_PTopology *)PTopology_vdata; + hypre_PTopology *PTopology= (hypre_PTopology *)PTopology_vdata; HYPRE_Int ierr = 0; if (PTopology) @@ -148,7 +148,7 @@ HYPRE_Int nvars, Face_nvars, Edge_nvars, part, var, box, fboxi; HYPRE_Int tot_vars= 8; - HYPRE_Int t, i, j, k, l, m, n, p, r; + HYPRE_Int t, i, j, k, l, m, n, p; HYPRE_Int ilower, iupper; HYPRE_Int jlower, jupper; @@ -1000,15 +1000,7 @@ /* loop over each cell and find the row rank of Element_edge and then the column ranks of the connected fine edges. */ - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,findex,entry,rank,nElements,low_index,t,hi_index,var,m,k,j,var_index,nElements_iedges) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1122,7 +1114,7 @@ } /* for (t= 0; t< Face_nvars; t++) */ } /* if (ndim == 2) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ } /* for (part= 0; part< nparts; part++) */ @@ -1277,15 +1269,7 @@ hypre_ClearIndex(stride); hypre_CopyIndex(upper_shifts[part][fboxi], stride); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1354,7 +1338,7 @@ (rank >= clower_ranks[part][var])) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ break; } /* case 2: x_Faces-> y_iedges, z_iedges */ @@ -1382,15 +1366,7 @@ hypre_ClearIndex(stride); hypre_CopyIndex(upper_shifts[part][fboxi], stride); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1458,7 +1434,7 @@ (rank >= clower_ranks[part][var])) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ break; } /* case 3: y_Faces-> x_iedges, z_iedges */ @@ -1486,15 +1462,7 @@ hypre_ClearIndex(stride); hypre_CopyIndex(upper_shifts[part][fboxi], stride); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1562,7 +1530,7 @@ } /* if ((rank <= cupper_ranks[part][var]) && (rank >= clower_ranks[part][var])) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ break; } /* case 4: z_Faces-> x_iedges, y_iedges */ @@ -1713,15 +1681,7 @@ hypre_ClearIndex(stride); hypre_CopyIndex(upper_shifts[part][fboxi], stride); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nEdges,cell_index,findex,j,var_index,m,entry,rank,nEdges_iedges) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1852,7 +1812,7 @@ } /* if ((rank <= cupper_ranks[part][var]) && (rank >= clower_ranks[part][var])) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ } /* for (t= 0; t< Edge_nvars; t++) */ @@ -1935,15 +1895,7 @@ hypre_BoxGetSize(cbox, loop_size); hypre_CopyIndex(hypre_BoxIMin(cbox), start); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Faces,var_index) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -1973,7 +1925,7 @@ } } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ } /* if (ndim == 3) */ @@ -1994,15 +1946,7 @@ hypre_BoxGetSize(cbox, loop_size); hypre_CopyIndex(hypre_BoxIMin(cbox), start); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Edges,var_index) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]); @@ -2161,7 +2105,7 @@ } /* switch (var) */ } /* for (t= 0; t< Edge_nvars; t++) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* hypre_ForBoxI(i, cboxes) */ } /* for (part= 0; part< nparts; part++) */ @@ -2368,16 +2312,8 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, stride, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, stride, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2398,7 +2334,7 @@ j++; } } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI */ hypre_TFree(boxoffset); @@ -2499,16 +2435,8 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2549,7 +2477,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyBox(cellbox, ©_box); @@ -2582,16 +2510,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2631,7 +2552,7 @@ } /* for (n= 1; n< rfactor[2]; n++) */ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -2690,16 +2611,9 @@ /* reset and then increase the loop_size by one in the Z_Face direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2738,7 +2652,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* X_Face */ hypre_CopyBox(cellbox, ©_box); @@ -2772,16 +2686,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2820,7 +2727,7 @@ } /* for (n= 1; n< rfactor[2]; n++) */ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -2879,16 +2786,9 @@ /* increase the loop_size by one in the X_Face direction */ loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -2930,7 +2830,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyBox(cellbox, ©_box); @@ -2964,16 +2864,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3014,7 +2907,7 @@ } /* for (n= 1; n< rfactor[0]; n++) */ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -3064,16 +2957,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3105,7 +2990,7 @@ } /* for (p= 1; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -3128,16 +3013,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3170,7 +3047,7 @@ } /* for (n= 0; n< rfactor[0]; n++) */ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -3193,16 +3070,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3245,7 +3114,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -3268,16 +3137,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3320,7 +3181,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -3344,16 +3205,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3396,7 +3249,7 @@ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -3596,16 +3449,8 @@ /* note that the correct cbox corresponding to this non-vanishing fbox is used. */ - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, stride, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,cindex,var_index,rank,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, stride, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3643,7 +3488,7 @@ k++; } } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI */ hypre_TFree(boxoffset); hypre_TFree(suboffset); @@ -3754,16 +3599,9 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -3862,7 +3700,7 @@ } } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyIndex(Edge_cstarts[part][i], cstart); @@ -3899,16 +3737,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4006,7 +3837,7 @@ } /* for (p= 0; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -4070,16 +3901,9 @@ /* increase the loop_size by one in the Z plane direction */ loop_size[2]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4172,7 +3996,7 @@ } } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* X_Face */ hypre_CopyBox(cellbox, ©_box); @@ -4210,16 +4034,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4317,7 +4134,7 @@ } /* for (p= 0; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -4381,16 +4198,9 @@ /* increase the loop_size by one in the X plane direction */ loop_size[0]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4483,7 +4293,7 @@ } } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); /* Y_Face */ hypre_CopyBox(cellbox, ©_box); @@ -4520,16 +4330,9 @@ hypre_CopyIndex(hypre_BoxIMin(©_box), start); loop_size[1]++; - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, m); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(m) + + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, m); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4627,7 +4430,7 @@ } /* for (p= 0; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(m); + hypre_SerialBoxLoop1End(m); } /* hypre_ForBoxI(i, fboxes) */ break; } @@ -4685,16 +4488,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4751,7 +4546,7 @@ } /* for (p= 1; p< rfactor[0]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -4776,16 +4571,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -4842,7 +4629,7 @@ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -4867,16 +4654,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -5004,7 +4783,7 @@ } /* for (n= 1; n< rfactor[1]; n++) */ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -5029,16 +4808,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -5167,7 +4938,7 @@ } /* for (p= 1; p< rfactor[2]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; @@ -5192,16 +4963,8 @@ loop_size); hypre_CopyIndex(hypre_BoxIMin(©_box), start); - hypre_BoxLoop1Begin(ndim, loop_size, - ©_box, start, rfactor, r); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop1For(r) + hypre_SerialBoxLoop1Begin(ndim, loop_size, + ©_box, start, rfactor, r); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]); @@ -5330,7 +5093,7 @@ } /* for (p= 1; p< rfactor[1]; p++) */ } - hypre_BoxLoop1End(r); + hypre_SerialBoxLoop1End(r); } /* hypre_ForBoxI(i, fboxes) */ break; diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_TV_setup.c hypre-2.13.0/src/sstruct_ls/maxwell_TV_setup.c --- hypre-2.11.2/src/sstruct_ls/maxwell_TV_setup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/maxwell_TV_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -32,7 +32,7 @@ hypre_SStructVector *b_in, hypre_SStructVector *x_in) { - hypre_MaxwellData *maxwell_TV_data = (hypre_MaxwellData *)maxwell_vdata; + hypre_MaxwellData *maxwell_TV_data = (hypre_MaxwellData *)maxwell_vdata; MPI_Comm comm = hypre_SStructMatrixComm(Aee_in); @@ -387,13 +387,12 @@ hypre_BoxGetSize(box_piece, loop_size); hypre_CopyIndex(hypre_BoxIMin(box_piece), start); - hypre_BoxLoop0Begin(ndim, loop_size); + hypre_SerialBoxLoop0Begin(ndim, loop_size); #if 0 /* Are private static arrays a problem? */ #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE #endif #endif - hypre_BoxLoop0For() { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -404,7 +403,7 @@ flag[rank-start_rank] = 0; flag2[rank-start_rank]= rank; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* if (hypre_BoxVolume(box_piece) < i) */ } /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array); m++) */ hypre_BoxArrayDestroy(tmp_box_array); @@ -490,7 +489,7 @@ hypre_SStructVectorParVector(bn), hypre_SStructVectorParVector(xn)); { - amg_data = (hypre_ParAMGData*) amg_vdata; + amg_data = (hypre_ParAMGData*) amg_vdata; node_numlevels= hypre_ParAMGDataNumLevels(amg_data); diff -Nru hypre-2.11.2/src/sstruct_ls/node_relax.c hypre-2.13.0/src/sstruct_ls/node_relax.c --- hypre-2.11.2/src/sstruct_ls/node_relax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/node_relax.c 2017-10-20 17:42:22.000000000 +0000 @@ -51,7 +51,7 @@ HYPRE_Real **A_loc; HYPRE_Real *x_loc; - /* pointers for vector and matrix data */ + /* pointers for vector and matrix data */ HYPRE_Real ***Ap; HYPRE_Real **bp; HYPRE_Real **xp; @@ -118,7 +118,7 @@ HYPRE_Int hypre_NodeRelaxDestroy( void *relax_vdata ) { - hypre_NodeRelaxData *relax_data = (hypre_NodeRelaxData *)relax_vdata; + hypre_NodeRelaxData *relax_data = (hypre_NodeRelaxData *)relax_vdata; HYPRE_Int i,vi; HYPRE_Int nvars; @@ -147,8 +147,8 @@ hypre_TFree(relax_data -> compute_pkgs); hypre_SStructPVectorDestroy(relax_data -> t); - hypre_TFree(relax_data -> x_loc); - hypre_TFree((relax_data ->A_loc)[0]); + hypre_UMTFree(relax_data -> x_loc); + hypre_UMTFree((relax_data ->A_loc)[0]); hypre_TFree(relax_data -> A_loc); hypre_TFree(relax_data -> bp); hypre_TFree(relax_data -> xp); @@ -277,9 +277,9 @@ * Allocate storage used to invert local diagonal blocks *----------------------------------------------------------*/ - x_loc = hypre_TAlloc(HYPRE_Real , hypre_NumThreads()*nvars); + x_loc = hypre_UMTAlloc(HYPRE_Real , hypre_NumThreads()*nvars); A_loc = hypre_TAlloc(HYPRE_Real *, hypre_NumThreads()*nvars); - A_loc[0] = hypre_TAlloc(HYPRE_Real , hypre_NumThreads()*nvars*nvars); + A_loc[0] = hypre_UMTAlloc(HYPRE_Real , hypre_NumThreads()*nvars*nvars); for (vi = 1; vi < hypre_NumThreads()*nvars; vi++) { A_loc[vi] = A_loc[0] + vi*nvars; @@ -564,15 +564,8 @@ hypre_Box *x_data_box; hypre_Box *t_data_box; - HYPRE_Int Ai; - HYPRE_Int bi; - HYPRE_Int xi; - HYPRE_Int ti; - HYPRE_Real **tA_loc = (relax_data -> A_loc); HYPRE_Real *tx_loc = (relax_data -> x_loc); - HYPRE_Real **A_loc; - HYPRE_Real *x_loc; HYPRE_Real ***Ap = (relax_data -> Ap); HYPRE_Real **bp = (relax_data -> bp); @@ -704,12 +697,14 @@ b_data_box, start, stride, bi, x_data_box, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, bi, xi) { - A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars]; - x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Real **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Real *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Int vi, vj; + /*------------------------------------------------ * Copy rhs and matrix for diagonal coupling * (intra-nodal) into local storage. @@ -840,6 +835,7 @@ #endif hypre_BoxLoop2For(bi, ti) { + HYPRE_Int vi; /* Copy rhs into temp vector */ for (vi = 0; vi < nvars; vi++) { @@ -904,12 +900,14 @@ A_data_box, start, stride, Ai, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, ti) { - A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars]; - x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Real **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Real *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars]; + HYPRE_Int vi, vj; + /*------------------------------------------------ * Copy rhs and matrix for diagonal coupling * (intra-nodal) into local storage. diff -Nru hypre-2.11.2/src/sstruct_ls/sstruct_sharedDOFComm.c hypre-2.13.0/src/sstruct_ls/sstruct_sharedDOFComm.c --- hypre-2.11.2/src/sstruct_ls/sstruct_sharedDOFComm.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_ls/sstruct_sharedDOFComm.c 2017-10-20 17:42:22.000000000 +0000 @@ -48,7 +48,7 @@ HYPRE_Int hypre_MaxwellOffProcRowDestroy(void *OffProcRow_vdata) { - hypre_MaxwellOffProcRow *OffProcRow= (hypre_MaxwellOffProcRow *)OffProcRow_vdata; + hypre_MaxwellOffProcRow *OffProcRow= (hypre_MaxwellOffProcRow *)OffProcRow_vdata; HYPRE_Int ierr= 0; if (OffProcRow) @@ -689,15 +689,7 @@ hypre_BoxGetSize(&boxman_entry_box, loop_size); hypre_CopyIndex(hypre_BoxIMin(&boxman_entry_box), start); - hypre_BoxLoop0Begin(ndim, loop_size); -#if 0 -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,rank,tot_nsendRowsNcols,n,col_inds,values,send_ColsData_alloc,k,tot_sendColsData) HYPRE_SMP_SCHEDULE -#endif -#else - hypre_BoxLoopSetOneBlock(); -#endif - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(lindex); hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]); @@ -742,7 +734,7 @@ } /* if (rank <= end_rank && rank >= start_rank) */ } /* if (entry) */ } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } /* if (proc != myproc) */ } /* for (m= 0; m< nboxman_entries; m++) */ diff -Nru hypre-2.11.2/src/sstruct_mv/headers hypre-2.13.0/src/sstruct_mv/headers --- hypre-2.11.2/src/sstruct_mv/headers 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_mv/headers 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,9 @@ cat > $INTERNAL_HEADER <<@ +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + + #ifndef hypre_SSTRUCT_MV_HEADER #define hypre_SSTRUCT_MV_HEADER diff -Nru hypre-2.11.2/src/sstruct_mv/_hypre_sstruct_mv.h hypre-2.13.0/src/sstruct_mv/_hypre_sstruct_mv.h --- hypre-2.11.2/src/sstruct_mv/_hypre_sstruct_mv.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_mv/_hypre_sstruct_mv.h 2017-10-20 17:42:22.000000000 +0000 @@ -1,4 +1,7 @@ +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + + #ifndef hypre_SSTRUCT_MV_HEADER #define hypre_SSTRUCT_MV_HEADER diff -Nru hypre-2.11.2/src/sstruct_mv/sstruct_matrix.c hypre-2.13.0/src/sstruct_mv/sstruct_matrix.c --- hypre-2.11.2/src/sstruct_mv/sstruct_matrix.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_mv/sstruct_matrix.c 2017-10-20 17:42:22.000000000 +0000 @@ -728,7 +728,7 @@ box = hypre_BoxArrayBox(boxes, b); hypre_CopyBox(box, ghost_box); if (matrix_type == HYPRE_SSTRUCT || matrix_type == HYPRE_STRUCT) - { + { hypre_BoxGrowByArray(ghost_box, hypre_StructGridNumGhost(sgrid)); } start = hypre_BoxIMin(box); @@ -963,7 +963,7 @@ hypre_IndexRef start; hypre_Index rs, cs; HYPRE_Int row_base, col_base; - HYPRE_Int d, ei, entry, ii, jj, i, mi, vi; + HYPRE_Int ei, entry, ii, jj, i; HYPRE_Int matrix_type = hypre_SStructMatrixObjectType(matrix); box = hypre_BoxCreate(ndim); @@ -982,7 +982,7 @@ int_box = hypre_BoxCreate(ndim); nrows = hypre_BoxVolume(vbox)*nentries; - ncols = hypre_CTAlloc(HYPRE_Int, nrows); + ncols = hypre_UMCTAlloc(HYPRE_Int, nrows); #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE #endif @@ -990,9 +990,9 @@ { ncols[i] = 1; } - rows = hypre_CTAlloc(HYPRE_Int, nrows); - cols = hypre_CTAlloc(HYPRE_Int, nrows); - ijvalues = hypre_CTAlloc(HYPRE_Complex, nrows); + rows = hypre_UMCTAlloc(HYPRE_Int, nrows); + cols = hypre_UMCTAlloc(HYPRE_Int, nrows); + ijvalues = hypre_UMCTAlloc(HYPRE_Complex, nrows); hypre_SetIndex(stride, 1); @@ -1045,14 +1045,18 @@ start = hypre_BoxIMin(int_box); hypre_BoxGetSize(int_box, loop_size); - hypre_BoxLoop2Begin(ndim, loop_size, + /*FIXME: It has to be the old boxloop */ + zypre_BoxLoop2Begin(ndim, loop_size, int_box, start, stride, mi, vbox, start, stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi,index,d) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif - hypre_BoxLoop2For(mi, vi) + zypre_BoxLoop2For(mi, vi) { + hypre_Index index; + HYPRE_Int d; + hypre_BoxLoopGetIndex(index); rows[nrows + mi] = row_base; cols[nrows + mi] = col_base; @@ -1063,7 +1067,7 @@ } ijvalues[nrows + mi] = values[ei + vi*nentries]; } - hypre_BoxLoop2End(mi, vi); + zypre_BoxLoop2End(mi, vi); nrows += hypre_BoxVolume(int_box); @@ -1099,10 +1103,10 @@ hypre_TFree(boxman_entries); - hypre_TFree(ncols); - hypre_TFree(rows); - hypre_TFree(cols); - hypre_TFree(ijvalues); + hypre_UMTFree(ncols); + hypre_UMTFree(rows); + hypre_UMTFree(cols); + hypre_UMTFree(ijvalues); hypre_BoxDestroy(to_box); hypre_BoxDestroy(map_box); @@ -1117,9 +1121,7 @@ { /* RDF: THREAD (Check safety on UMatrixSetValues call) */ hypre_BoxGetSize(vbox, loop_size); - hypre_BoxLoop0Begin(ndim, loop_size); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(index); hypre_AddIndexes(index, hypre_BoxIMin(vbox), ndim, index); @@ -1127,7 +1129,7 @@ nentries, entries, values, action); values += nentries; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } hypre_BoxDestroy(box); @@ -1383,7 +1385,7 @@ hypre_SStructBoxManInfo *frinfo, *toinfo; HYPRE_Complex *tvalues = NULL; HYPRE_Int nfrentries, ntoentries, frpart, topart; - HYPRE_Int entry, sentry, ei, fri, toi, vi, mi; + HYPRE_Int entry, sentry, ei, fri, toi; pmatrix = hypre_SStructMatrixPMatrix(matrix, part); @@ -1481,7 +1483,7 @@ ibox1, start, stride, mi, vbox, start, stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(mi, vi) { @@ -1513,7 +1515,7 @@ ibox1, start, stride, mi, vbox, start, stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(mi, vi) { diff -Nru hypre-2.11.2/src/sstruct_mv/sstruct_vector.c hypre-2.13.0/src/sstruct_mv/sstruct_vector.c --- hypre-2.11.2/src/sstruct_mv/sstruct_vector.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/sstruct_mv/sstruct_vector.c 2017-10-20 17:42:22.000000000 +0000 @@ -614,11 +614,9 @@ hypre_SStructPVector *pvector; hypre_StructVector *y; hypre_Box *y_data_box; - HYPRE_Int yi; HYPRE_Complex *yp; hypre_BoxArray *boxes; hypre_Box *box; - HYPRE_Int bi; hypre_Index loop_size; hypre_IndexRef start; hypre_Index stride; @@ -655,7 +653,7 @@ y_data_box, start, stride, yi, box, start, stride, bi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(yi, bi) { @@ -698,11 +696,9 @@ hypre_SStructPVector *pvector; hypre_StructVector *y; hypre_Box *y_data_box; - HYPRE_Int yi; HYPRE_Complex *yp; hypre_BoxArray *boxes; hypre_Box *box; - HYPRE_Int bi; hypre_Index loop_size; hypre_IndexRef start; hypre_Index stride; @@ -741,7 +737,7 @@ y_data_box, start, stride, yi, box, start, stride, bi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(yi, bi) { diff -Nru hypre-2.11.2/src/struct_ls/cyclic_reduction.c hypre-2.13.0/src/struct_ls/cyclic_reduction.c --- hypre-2.11.2/src/struct_ls/cyclic_reduction.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/cyclic_reduction.c 2017-10-20 17:42:22.000000000 +0000 @@ -26,7 +26,7 @@ #define hypre_CycRedSetCIndex(base_index, base_stride, level, cdir, cindex) \ { \ if (level > 0) \ - hypre_SetIndex3(cindex, 0, 0, 0); \ + hypre_SetIndex3(cindex, 0, 0, 0); \ else \ hypre_CopyIndex(base_index, cindex); \ hypre_IndexD(cindex, cdir) += 0; \ @@ -35,7 +35,7 @@ #define hypre_CycRedSetFIndex(base_index, base_stride, level, cdir, findex) \ { \ if (level > 0) \ - hypre_SetIndex3(findex, 0, 0, 0); \ + hypre_SetIndex3(findex, 0, 0, 0); \ else \ hypre_CopyIndex(base_index, findex); \ hypre_IndexD(findex, cdir) += 1; \ @@ -44,7 +44,7 @@ #define hypre_CycRedSetStride(base_index, base_stride, level, cdir, stride) \ { \ if (level > 0) \ - hypre_SetIndex3(stride, 1, 1, 1); \ + hypre_SetIndex3(stride, 1, 1, 1); \ else \ hypre_CopyIndex(base_stride, stride); \ hypre_IndexD(stride, cdir) *= 2; \ @@ -238,11 +238,8 @@ HYPRE_Real *a_cc, *a_cw, *a_ce; HYPRE_Real *ac_cc, *ac_cw, *ac_ce; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int offsetA; + HYPRE_Int offsetA; stridef = cstride; hypre_SetIndex3(stridec, 1, 1, 1); @@ -334,12 +331,12 @@ A_dbox, fstart, stridef, iA, Ac_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { - iAm1 = iA - offsetA; - iAp1 = iA + offsetA; + HYPRE_Int iAm1 = iA - offsetA; + HYPRE_Int iAp1 = iA + offsetA; ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1]; @@ -365,12 +362,12 @@ A_dbox, fstart, stridef, iA, Ac_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { - iAm1 = iA - offsetA; - iAp1 = iA + offsetA; + HYPRE_Int iAm1 = iA - offsetA; + HYPRE_Int iAp1 = iA + offsetA; ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1]; @@ -431,7 +428,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size, Ac_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { @@ -453,7 +450,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size, Ac_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { @@ -482,7 +479,7 @@ hypre_StructVector *b, hypre_StructVector *x ) { - hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata; + hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata; MPI_Comm comm = (cyc_red_data -> comm); HYPRE_Int cdir = (cyc_red_data -> cdir); @@ -594,7 +591,7 @@ /*----------------------------------------------------- * Set up matrix and vector structures *-----------------------------------------------------*/ - + A_l = hypre_TAlloc(hypre_StructMatrix *, num_levels); x_l = hypre_TAlloc(hypre_StructVector *, num_levels); @@ -615,8 +612,9 @@ data_size += hypre_StructVectorDataSize(x_l[l+1]); } - data = hypre_SharedCTAlloc(HYPRE_Real, data_size); - + //data = hypre_SharedCTAlloc(HYPRE_Real, data_size); + data = hypre_DeviceCTAlloc(HYPRE_Real,data_size); + (cyc_red_data -> data) = data; for (l = 0; l < (num_levels - 1); l++) @@ -739,7 +737,7 @@ hypre_StructVector *b, hypre_StructVector *x ) { - hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; + hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; HYPRE_Int num_levels = (cyc_red_data -> num_levels); HYPRE_Int cdir = (cyc_red_data -> cdir); @@ -773,11 +771,6 @@ HYPRE_Real *xp, *xwp, *xep; HYPRE_Real *bp; HYPRE_Real *xcp; - - HYPRE_Int Ai; - HYPRE_Int xi; - HYPRE_Int bi; - HYPRE_Int xci; hypre_Index cindex; hypre_Index stride; @@ -821,12 +814,12 @@ hypre_CopyIndex(hypre_BoxIMin(compute_box), start); hypre_BoxGetStrideSize(compute_box, base_stride, loop_size); - + hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, x_dbox, start, base_stride, xi, b_dbox, start, base_stride, bi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,bi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, bi) { @@ -879,7 +872,7 @@ A_dbox, start, stride, Ai, x_dbox, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, xi) { @@ -952,13 +945,13 @@ hypre_StructMapFineToCoarse(start, cindex, stride, startc); hypre_BoxGetStrideSize(compute_box, stride, loop_size); - + hypre_BoxLoop3Begin(hypre_StructVectorNDim(x), loop_size, A_dbox, start, stride, Ai, x_dbox, start, stride, xi, xc_dbox, startc, stridec, xci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,xci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, xci) { @@ -1003,7 +996,7 @@ A_dbox, start, stride, Ai, x_dbox, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, xi) { @@ -1128,7 +1121,7 @@ A_dbox, start, stride, Ai, x_dbox, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, xi) { @@ -1159,7 +1152,7 @@ hypre_Index base_index, hypre_Index base_stride ) { - hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; + hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; HYPRE_Int d; for (d = 0; d < 3; d++) @@ -1181,7 +1174,7 @@ hypre_CyclicReductionSetCDir( void *cyc_red_vdata, HYPRE_Int cdir ) { - hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; + hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; (cyc_red_data -> cdir) = cdir; @@ -1195,7 +1188,7 @@ HYPRE_Int hypre_CyclicReductionDestroy( void *cyc_red_vdata ) { - hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; + hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata; HYPRE_Int l; @@ -1215,7 +1208,7 @@ hypre_ComputePkgDestroy(cyc_red_data -> up_compute_pkg_l[l]); } hypre_BoxArrayDestroy(cyc_red_data -> fine_points_l[l]); - hypre_SharedTFree(cyc_red_data -> data); + hypre_DeviceTFree(cyc_red_data -> data); hypre_TFree(cyc_red_data -> grid_l); hypre_TFree(cyc_red_data -> fine_points_l); hypre_TFree(cyc_red_data -> A_l); diff -Nru hypre-2.11.2/src/struct_ls/HYPRE_struct_int.c hypre-2.13.0/src/struct_ls/HYPRE_struct_int.c --- hypre-2.11.2/src/struct_ls/HYPRE_struct_int.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/HYPRE_struct_int.c 2017-10-20 17:42:22.000000000 +0000 @@ -19,7 +19,6 @@ { hypre_Box *v_data_box; - HYPRE_Int vi; HYPRE_Real *vp; hypre_BoxArray *boxes; @@ -34,7 +33,8 @@ * Set the vector coefficients *-----------------------------------------------------------------------*/ - srand( seed ); +// srand( seed ); + hypre_SeedRand(seed); hypre_SetIndex3(unit_stride, 1, 1, 1); @@ -53,11 +53,12 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { - vp[vi] = 2.0*rand()/RAND_MAX - 1.0; +// vp[vi] = 2.0*rand()/RAND_MAX - 1.0; + vp[vi] = 2.0*hypre_Rand() - 1.0; } hypre_BoxLoop1End(vi); } diff -Nru hypre-2.11.2/src/struct_ls/_hypre_struct_ls.h hypre-2.13.0/src/struct_ls/_hypre_struct_ls.h --- hypre-2.11.2/src/struct_ls/_hypre_struct_ls.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/_hypre_struct_ls.h 2017-10-20 17:42:22.000000000 +0000 @@ -32,8 +32,6 @@ #ifdef __cplusplus extern "C" { #endif - - /* coarsen.c */ HYPRE_Int hypre_StructMapFineToCoarse ( hypre_Index findex , hypre_Index index , hypre_Index stride , hypre_Index cindex ); HYPRE_Int hypre_StructMapCoarseToFine ( hypre_Index cindex , hypre_Index index , hypre_Index stride , hypre_Index findex ); diff -Nru hypre-2.11.2/src/struct_ls/HYPRE_struct_pcg.c hypre-2.13.0/src/struct_ls/HYPRE_struct_pcg.c --- hypre-2.11.2/src/struct_ls/HYPRE_struct_pcg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/HYPRE_struct_pcg.c 2017-10-20 17:42:22.000000000 +0000 @@ -203,10 +203,6 @@ HYPRE_Real *Ap; HYPRE_Real *yp; HYPRE_Real *xp; - - HYPRE_Int Ai; - HYPRE_Int yi; - HYPRE_Int xi; hypre_Index index; hypre_IndexRef start; @@ -240,7 +236,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { diff -Nru hypre-2.11.2/src/struct_ls/pfmg2_setup_rap.c hypre-2.13.0/src/struct_ls/pfmg2_setup_rap.c --- hypre-2.11.2/src/struct_ls/pfmg2_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg2_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -311,11 +311,7 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iA, iAm1, iAp1; HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int yOffsetA, yOffsetA_diag, yOffsetA_offd; HYPRE_Int xOffsetP; @@ -475,21 +471,20 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; - - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; + + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]; - + iP1 = iP - yOffsetP; rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_cs[iAm1] + a_cs[iA] * pa[iP1]; - iP1 = iP - yOffsetP + xOffsetP; rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1]; @@ -497,7 +492,7 @@ rap_cw[iAc] = a_cw[iA] + rb[iR] * a_cw[iAm1] * pb[iP1] + ra[iR] * a_cw[iAp1] * pa[iP1]; - + rap_cc[iAc] = a_cc[iA] + rb[iR] * a_cc[iAm1] * pb[iP] + ra[iR] * a_cc[iAp1] * pa[iP] @@ -529,14 +524,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA_diag; - iAp1 = iA + yOffsetA_diag; + HYPRE_Int iAm1 = iA - yOffsetA_diag; + HYPRE_Int iAp1 = iA + yOffsetA_diag; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1]; iP1 = iP - yOffsetP; @@ -817,11 +812,7 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iA, iAm1, iAp1; HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int yOffsetA, yOffsetA_diag, yOffsetA_offd; HYPRE_Int xOffsetP; @@ -1000,14 +991,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1] + rb[iR] * a_csw[iAm1] + a_csw[iA] * pa[iP1]; @@ -1064,21 +1055,20 @@ a_cnw_offd = a_cnw[iA_offd]; a_cnw_offdm1 = a_cnw[iA_offdm1]; - hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA_diag; - iAp1 = iA + yOffsetA_diag; + HYPRE_Int iAm1 = iA - yOffsetA_diag; + HYPRE_Int iAp1 = iA + yOffsetA_diag; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1] + rb[iR] * a_csw_offdm1 + a_csw_offd * pa[iP1]; @@ -1511,10 +1501,8 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_cnw, *rap_cne; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; + HYPRE_Int yOffsetA, yOffsetA_diag, yOffsetA_offd; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -1668,20 +1656,21 @@ if ( constant_coefficient_A == 0 ) { /*hypre_printf("nosym 5.0.0\n");*/ + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]; iP1 = iP + yOffsetP; @@ -1713,20 +1702,20 @@ a_ce_offd = a_ce[iA_offd]; a_ce_offdm1 = a_ce[iA_offdm1]; a_ce_offdp1 = a_ce[iA_offdp1]; - + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAp1 = iA + yOffsetA_diag; + HYPRE_Int iAp1 = iA + yOffsetA_diag; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1]; iP1 = iP + yOffsetP; @@ -1981,10 +1970,7 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_cnw, *rap_cne; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; HYPRE_Int yOffsetA, yOffsetA_diag, yOffsetA_offd; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -2164,20 +2150,21 @@ if ( constant_coefficient_A==0 ) { /*hypre_printf("nosym 9.0.0\n");*/ + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1] + ra[iR] * a_cne[iAp1] + a_cne[iA] * pb[iP1]; @@ -2231,14 +2218,13 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA_diag; - iAp1 = iA + yOffsetA_diag; + HYPRE_Int iAp1 = iA + yOffsetA_diag; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1] + ra[iR] * a_cne_offdp1 + a_cne_offd * pb[iP1]; diff -Nru hypre-2.11.2/src/struct_ls/pfmg3_setup_rap.c hypre-2.13.0/src/struct_ls/pfmg3_setup_rap.c --- hypre-2.11.2/src/struct_ls/pfmg3_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg3_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -359,10 +359,7 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; @@ -565,14 +562,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP; rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - xOffsetP; @@ -632,21 +629,21 @@ a_bc_offdp1 = a_bc[iA_offdp1]; a_ac_offd = a_ac[iA_offd]; a_ac_offdm1 = a_ac[iA_offdm1]; - + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP - zOffsetP - yOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP; rap_bs[iAc] = rb[iR] * a_cs_offdm1 * pa[iP1]; iP1 = iP - zOffsetP - xOffsetP; @@ -656,7 +653,7 @@ rap_bc[iAc] = a_bc_offd * pa[iP1] + rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_bc_offdm1; - + iP1 = iP - zOffsetP + xOffsetP; rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1]; @@ -1015,11 +1012,8 @@ HYPRE_Real *rap_csw, *rap_cse; HYPRE_Real *rap_bsw, *rap_bse, *rap_bnw, *rap_bne; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; - + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; + HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; HYPRE_Int zOffsetA_offd; @@ -1305,14 +1299,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - yOffsetP; @@ -1327,7 +1321,7 @@ rap_bw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1] + rb[iR] * a_bw[iAm1] + a_bw[iA] * pa[iP1]; - + iP1 = iP - zOffsetP; rap_bc[iAc] = a_bc[iA] * pa[iP1] + rb[iR] * a_cc[iAm1] * pa[iP1] @@ -1348,7 +1342,7 @@ iP1 = iP - zOffsetP + yOffsetP + xOffsetP; rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1]; - + iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = a_csw[iA] + rb[iR] * a_csw[iAm1] * pb[iP1] @@ -1435,14 +1429,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1]; iP1 = iP - zOffsetP - yOffsetP; @@ -1457,12 +1451,12 @@ rap_bw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1] + rb[iR] * a_bw_offdm1 + a_bw_offd * pa[iP1]; - + iP1 = iP - zOffsetP; rap_bc[iAc] = a_bc_offd * pa[iP1] + rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_bc_offdm1; - + iP1 = iP - zOffsetP + xOffsetP; rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1] + rb[iR] * a_be_offdm1 @@ -1483,7 +1477,7 @@ rap_csw[iAc] = a_csw_offd + rb[iR] * a_csw_offdm1 * pb[iP1] + ra[iR] * a_csw_offdp1 * pa[iP1]; - + iP1 = iP - yOffsetP; rap_cs[iAc] = a_cs_offd + rb[iR] * a_cs_offdm1 * pb[iP1] @@ -1975,10 +1969,8 @@ HYPRE_Real *rap_csw, *rap_cse; HYPRE_Real *rap_bsw, *rap_bse, *rap_bnw, *rap_bne; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; + HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; @@ -2302,14 +2294,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1] + rb[iR] * a_bsw[iAm1] + a_bsw[iA] * pa[iP1]; @@ -2333,7 +2325,7 @@ rap_bc[iAc] = a_bc[iA] * pa[iP1] + rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_bc[iAm1]; - + iP1 = iP - zOffsetP + xOffsetP; rap_be[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1] + rb[iR] * a_be[iAm1] @@ -2362,7 +2354,7 @@ + a_asw[iA] * pa[iP1] + rb[iR] * a_asw[iAm1] + ra[iR] * a_bsw[iAp1]; - + iP1 = iP - yOffsetP; rap_cs[iAc] = a_cs[iA] + rb[iR] * a_cs[iAm1] * pb[iP1] @@ -2461,14 +2453,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1] + rb[iR] * a_bsw_offdm1 + a_bsw_offd * pa[iP1]; @@ -2492,7 +2484,7 @@ rap_bc[iAc] = a_bc_offd * pa[iP1] + rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_bc_offdm1; - + iP1 = iP - zOffsetP + xOffsetP; rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1] + rb[iR] * a_be_offdm1 @@ -2512,7 +2504,7 @@ rap_bne[iAc] = rb[iR] * a_cne_offdm1 * pa[iP1] + rb[iR] * a_bne_offdm1 + a_bne_offd * pa[iP1]; - + iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = a_csw_offd + rb[iR] * a_csw_offdm1 * pb[iP1] @@ -2530,7 +2522,7 @@ + a_as_offd * pa[iP1] + rb[iR] * a_as_offdm1 + ra[iR] * a_bs_offdp1; - + iP1 = iP - yOffsetP + xOffsetP; rap_cse[iAc] = a_cse_offd + rb[iR] * a_cse_offdm1 * pb[iP1] @@ -2539,7 +2531,7 @@ + a_ase_offd * pa[iP1] + rb[iR] * a_ase_offdm1 + ra[iR] * a_bse_offdp1; - + iP1 = iP - xOffsetP; rap_cw[iAc] = a_cw_offd + rb[iR] * a_cw_offdm1 * pb[iP1] @@ -3179,10 +3171,8 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an; HYPRE_Real *rap_cnw, *rap_cne; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; + HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; HYPRE_Int zOffsetA_offd; @@ -3382,42 +3372,42 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; - - iP1 = iP + zOffsetP + yOffsetP; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; + + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP; rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + xOffsetP; rap_ae[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]; - + iP1 = iP + zOffsetP; rap_ac[iAc] = a_ac[iA] * pb[iP1] + ra[iR] * a_cc[iAp1] * pb[iP1] + ra[iR] * a_ac[iAp1]; - + iP1 = iP + zOffsetP - xOffsetP; rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP; rap_as[iAc] = ra[iR] * a_cs[iAp1] * pb[iP1]; - + iP1 = iP + yOffsetP; rap_cn[iAc] = a_cn[iA] + rb[iR] * a_cn[iAm1] * pb[iP1] + ra[iR] * a_cn[iAp1] * pa[iP1]; - + iP1 = iP + xOffsetP; rap_ce[iAc] = a_ce[iA] + rb[iR] * a_ce[iAm1] * pb[iP1] + ra[iR] * a_ce[iAp1] * pa[iP1]; - + rap_cnw[iAc] = 0.0; - + rap_cne[iAc] = 0.0; } hypre_BoxLoop4End(iP, iR, iA, iAc); @@ -3444,14 +3434,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + //HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP + zOffsetP + yOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP; rap_an[iAc] = ra[iR] * a_cn_offdp1 * pb[iP1]; iP1 = iP + zOffsetP + xOffsetP; @@ -3785,10 +3775,7 @@ HYPRE_Real *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an; HYPRE_Real *rap_cnw, *rap_cne; HYPRE_Real *rap_asw, *rap_ase, *rap_anw, *rap_ane; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; HYPRE_Int zOffsetA_offd; @@ -4068,20 +4055,21 @@ if ( constant_coefficient_A == 0 ) { + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + yOffsetP; @@ -4106,7 +4094,7 @@ rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1] + ra[iR] * a_aw[iAp1] + a_aw[iA] * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP + xOffsetP; rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1]; @@ -4193,14 +4181,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + //HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1]; iP1 = iP + zOffsetP + yOffsetP; @@ -4233,7 +4221,7 @@ rap_as[iAc] = ra[iR] * a_cs_offdp1 * pb[iP1] + ra[iR] * a_as_offdp1 + a_as_offd * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP - xOffsetP; rap_asw[iAc] = ra[iR] * a_csw_offdp1 * pb[iP1]; @@ -4250,7 +4238,7 @@ + a_an_offd * pa[iP1] + rb[iR] * a_an_offdm1 + ra[iR] * a_bn_offdp1; - + iP1 = iP + yOffsetP - xOffsetP; rap_cnw[iAc] = a_cnw_offd + rb[iR] * a_cnw_offdm1 * pb[iP1] @@ -4700,10 +4688,7 @@ HYPRE_Real *rap_cnw, *rap_cne; HYPRE_Real *rap_asw, *rap_ase, *rap_anw, *rap_ane; - HYPRE_Int iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; + HYPRE_Int iA_offd, iA_offdm1, iA_offdp1; HYPRE_Int zOffsetA; HYPRE_Int zOffsetA_diag; @@ -5021,20 +5006,21 @@ if ( constant_coefficient_A == 0 ) { + hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, cstart, stridec, iP, R_dbox, cstart, stridec, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1] + ra[iR] * a_ane[iAp1] + a_ane[iA] * pb[iP1]; @@ -5063,7 +5049,7 @@ rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1] + ra[iR] * a_aw[iAp1] + a_aw[iA] * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP + xOffsetP; rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1] + ra[iR] * a_ase[iAp1] @@ -5088,7 +5074,7 @@ + a_ane[iA] * pa[iP1] + rb[iR] * a_ane[iAm1] + ra[iR] * a_bne[iAp1]; - + iP1 = iP + yOffsetP; rap_cn[iAc] = a_cn[iA] + rb[iR] * a_cn[iAm1] * pb[iP1] @@ -5177,14 +5163,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA_diag; - iAp1 = iA + zOffsetA_diag; + //HYPRE_Int iAm1 = iA - zOffsetA_diag; + HYPRE_Int iAp1 = iA + zOffsetA_diag; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1] + ra[iR] * a_ane_offdp1 + a_ane_offd * pb[iP1]; @@ -5208,12 +5194,12 @@ rap_ac[iAc] = a_ac_offd * pb[iP1] + ra[iR] * a_cc[iAp1] * pb[iP1] + ra[iR] * a_ac_offdp1; - + iP1 = iP + zOffsetP - xOffsetP; rap_aw[iAc] = ra[iR] * a_cw_offdp1 * pb[iP1] + ra[iR] * a_aw_offdp1 + a_aw_offd * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP + xOffsetP; rap_ase[iAc] = ra[iR] * a_cse_offdp1 * pb[iP1] + ra[iR] * a_ase_offdp1 @@ -5229,7 +5215,6 @@ + ra[iR] * a_asw_offdp1 + a_asw_offd * pb[iP1]; - iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = a_cne_offd + rb[iR] * a_cne_offdm1 * pb[iP1] @@ -5247,7 +5232,7 @@ + a_an_offd * pa[iP1] + rb[iR] * a_an_offdm1 + ra[iR] * a_bn_offdp1; - + iP1 = iP + yOffsetP - xOffsetP; rap_cnw[iAc] = a_cnw_offd + rb[iR] * a_cnw_offdm1 * pb[iP1] diff -Nru hypre-2.11.2/src/struct_ls/pfmg.c hypre-2.13.0/src/struct_ls/pfmg.c --- hypre-2.11.2/src/struct_ls/pfmg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg.c 2017-10-20 17:42:22.000000000 +0000 @@ -57,7 +57,7 @@ HYPRE_Int hypre_PFMGDestroy( void *pfmg_vdata ) { - hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata; + hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata; HYPRE_Int l; @@ -73,6 +73,9 @@ if ((pfmg_data -> num_levels) > -1) { + HYPRE_Int constant_coefficient = + hypre_StructMatrixConstantCoefficient(pfmg_data -> A_l[0]); + for (l = 0; l < (pfmg_data -> num_levels); l++) { if (pfmg_data -> active_l[l]) @@ -106,7 +109,11 @@ hypre_StructVectorDestroy(pfmg_data -> x_l[l+1]); hypre_StructVectorDestroy(pfmg_data -> tx_l[l+1]); } - hypre_SharedTFree(pfmg_data -> data); + if (constant_coefficient == 0) + {hypre_DeviceTFree(pfmg_data -> data);} + else + {hypre_UMTFree(pfmg_data -> data);} + hypre_TFree(pfmg_data -> cdir_l); hypre_TFree(pfmg_data -> active_l); hypre_TFree(pfmg_data -> grid_l); diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup.c hypre-2.13.0/src/struct_ls/pfmg_setup.c --- hypre-2.11.2/src/struct_ls/pfmg_setup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -17,19 +17,19 @@ #define hypre_PFMGSetCIndex(cdir, cindex) \ { \ - hypre_SetIndex3(cindex, 0, 0, 0); \ + hypre_SetIndex3(cindex, 0, 0, 0); \ hypre_IndexD(cindex, cdir) = 0; \ } #define hypre_PFMGSetFIndex(cdir, findex) \ { \ - hypre_SetIndex3(findex, 0, 0, 0); \ + hypre_SetIndex3(findex, 0, 0, 0); \ hypre_IndexD(findex, cdir) = 1; \ } #define hypre_PFMGSetStride(cdir, stride) \ { \ - hypre_SetIndex3(stride, 1, 1, 1); \ + hypre_SetIndex3(stride, 1, 1, 1); \ hypre_IndexD(stride, cdir) = 2; \ } @@ -42,7 +42,7 @@ hypre_StructVector *b, hypre_StructVector *x ) { - hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata; + hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata; MPI_Comm comm = (pfmg_data -> comm); @@ -104,6 +104,8 @@ HYPRE_Int b_num_ghost[] = {0, 0, 0, 0, 0, 0}; HYPRE_Int x_num_ghost[] = {1, 1, 1, 1, 1, 1}; + HYPRE_Int constant_coefficient; + #if DEBUG char filename[255]; #endif @@ -116,7 +118,8 @@ grid = hypre_StructMatrixGrid(A); ndim = hypre_StructGridNDim(grid); - + constant_coefficient = hypre_StructMatrixConstantCoefficient(A); + /* Compute a new max_levels value based on the grid */ cbox = hypre_BoxDuplicate(hypre_StructGridBoundingBox(grid)); max_levels = 1; @@ -377,7 +380,12 @@ hypre_StructVectorInitializeShell(tx_l[l+1]); } - data = hypre_SharedCTAlloc(HYPRE_Real, data_size); + //data = hypre_DeviceCTAlloc(HYPRE_Real,data_size); + if (constant_coefficient == 0) + data = hypre_DeviceCTAlloc(HYPRE_Real,data_size); + else + data = hypre_UMCTAlloc(HYPRE_Real,data_size); + (pfmg_data -> data) = data; hypre_StructVectorInitializeData(tx_l[0], data); @@ -730,6 +738,244 @@ /* constant_coefficient==0, all coefficients vary with space */ else { +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA) + /*FIXME: need reduction for more variables*/ + HYPRE_Int tmp = 0; + hypre_MatrixIndexMove(A, stencil_size, i, tmp, 3); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:cx) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,cx); + { + HYPRE_Int tcx = 0.0; + HYPRE_Complex *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + //Ap = (data_A + indices_d[sdiag]); + + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + /* x-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]); + if (Astenc) + { + tcx -= Ap[Ai]*diag; + } + } + + cx += tcx; + } + hypre_newBoxLoop1ReductionEnd(Ai,cx); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:sqcx) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,sqcx); + { + HYPRE_Int tcx = 0.0; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + /* x-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]); + if (Astenc) + { + tcx -= Ap[Ai]*diag; + } + } + sqcx += (tcx*tcx); + } + hypre_newBoxLoop1ReductionEnd(Ai,sqcx); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:cy) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,cy); + { + HYPRE_Int tcy = 0.0; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + /* y-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]); + if (Astenc) + { + tcy -= Ap[Ai]*diag; + } + } + + cy += tcy; + } + hypre_newBoxLoop1ReductionEnd(Ai,cy); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:sqcy) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,sqcy); + { + HYPRE_Int tcy = 0.0; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + /* y-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]); + if (Astenc) + { + tcy -= Ap[Ai]*diag; + } + } + sqcy += (tcy*tcy); + } + hypre_newBoxLoop1ReductionEnd(Ai,sqcy); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:cz) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,cz); + { + HYPRE_Int tcz = 0.0; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + /* z-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]); + if (Astenc) + { + tcz -= Ap[Ai]*diag; + } + } + + cz += tcz; + } + hypre_newBoxLoop1ReductionEnd(Ai,cz); +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:sqcz) + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,sqcz); + { + HYPRE_Int tcz = 0.0; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + + /* get sign of diagonal */ + Ap = hypre_StructGetMatrixBoxData(A, i, sdiag); + diag = 1.0; + if (Ap[Ai] < 0) + { + diag = -1.0; + } + + for (si = 0; si < stencil_size; si++) + { + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + /* z-direction */ + Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]); + if (Astenc) + { + tcz -= Ap[Ai]*diag; + } + } + sqcz += (tcz*tcz); + } + hypre_newBoxLoop1ReductionEnd(Ai,sqcz); + hypre_StructCleanIndexD(); +#else hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai); #ifdef HYPRE_USING_OPENMP @@ -737,6 +983,11 @@ #endif hypre_BoxLoop1For(Ai) { + HYPRE_Int tcx,tcy,tcz; + HYPRE_Real *Ap; + HYPRE_Int Astenc,si; + HYPRE_Real diag; + tcx = 0.0; tcy = 0.0; tcz = 0.0; @@ -784,9 +1035,10 @@ sqcz += (tcz*tcz); } hypre_BoxLoop1End(Ai); +#endif } } - + cxyz[0] = cx; cxyz[1] = cy; cxyz[2] = cz; @@ -850,7 +1102,7 @@ } else { - dxyz[d] = 1.0e+123; + dxyz[d] = HYPRE_REAL_MAX/1000; } } @@ -910,6 +1162,11 @@ } else { + /*FIXME: need reduction for multiplication*/ +#if defined(HYPRE_USE_CUDA) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) + hypre_newBoxLoop1ReductionMult(hypre_StructMatrixNDim(A), loop_size, + A_dbox, start, stride, Ai,Ap,diag_product); +#else hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai); #ifdef HYPRE_USING_OPENMP @@ -920,6 +1177,7 @@ diag_product *= Ap[Ai]; } hypre_BoxLoop1End(Ai); +#endif } } diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_interp.c hypre-2.13.0/src/struct_ls/pfmg_setup_interp.c --- hypre-2.11.2/src/struct_ls/pfmg_setup_interp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg_setup_interp.c 2017-10-20 17:42:22.000000000 +0000 @@ -239,25 +239,25 @@ HYPRE_Int si0, HYPRE_Int si1 ) { - HYPRE_Int si; - HYPRE_Int Ai, Pi; - HYPRE_Real *Ap; - HYPRE_Real center; - HYPRE_Int Astenc; - HYPRE_Int mrk0, mrk1; hypre_StructStencil *stencil = hypre_StructMatrixStencil(A); hypre_Index *stencil_shape = hypre_StructStencilShape(stencil); HYPRE_Int stencil_size = hypre_StructStencilSize(stencil); HYPRE_Int warning_cnt= 0; + hypre_MatrixIndexMove(A, stencil_size, i, cdir,1); + hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, P_dbox, startc, stridec, Pi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,si,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, Pi) { + HYPRE_Int si,mrk0,mrk1,Astenc; + HYPRE_Real center; + HYPRE_Real *Ap; + center = 0.0; Pp0[Pi] = 0.0; Pp1[Pi] = 0.0; @@ -266,9 +266,10 @@ for (si = 0; si < stencil_size; si++) { - Ap = hypre_StructMatrixBoxData(A, i, si); - Astenc = hypre_IndexD(stencil_shape[si], cdir); - + Ap = hypre_StructGetMatrixBoxData(A, i, si); + + Astenc = hypre_StructGetIndexD(stencil_shape[si], cdir,stencil_shape_d[si]); + if (Astenc == 0) { center += Ap[Ai]; @@ -281,7 +282,7 @@ { Pp1[Pi] -= Ap[Ai]; } - + if (si == si0 && Ap[Ai] == 0.0) mrk0++; if (si == si1 && Ap[Ai] == 0.0) @@ -290,7 +291,7 @@ if (!center) { - warning_cnt++; + //warning_cnt++; Pp0[Pi] = 0.0; Pp1[Pi] = 0.0; } @@ -449,9 +450,9 @@ HYPRE_Int Pi; HYPRE_Real *Ap; HYPRE_Real P0, P1; - HYPRE_Real center, center_offd; + HYPRE_Real center_offd; HYPRE_Int Astenc; - HYPRE_Int mrk0, mrk1, mrk0_offd, mrk1_offd; + HYPRE_Int mrk0_offd, mrk1_offd; hypre_StructStencil *stencil = hypre_StructMatrixStencil(A); hypre_Index *stencil_shape = hypre_StructStencilShape(stencil); HYPRE_Int stencil_size = hypre_StructStencilSize(stencil); @@ -512,23 +513,34 @@ } si = diag_rank; + + hypre_MatrixIndexMove(A, stencil_size, i, si, 1); hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, P_dbox, startc, stridec, Pi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, Pi) { - Pp0[Pi] = P0; - Pp1[Pi] = P1; + HYPRE_Int mrk0,mrk1; + HYPRE_Real center; + HYPRE_Real *Ap; + HYPRE_Real p0val,p1val; + + //Pp0[Pi] = P0; + //Pp1[Pi] = P1; + p0val = P0; + p1val = P1; + center = center_offd; mrk0 = mrk0_offd; mrk1 = mrk1_offd; + + Ap = hypre_StructGetMatrixBoxData(A, i, si); + //Astenc = hypre_IndexD(stencil_shape[si], cdir); + //hypre_assert( Astenc==0 ); - Ap = hypre_StructMatrixBoxData(A, i, si); - Astenc = hypre_IndexD(stencil_shape[si], cdir); - hypre_assert( Astenc==0 ); center += Ap[Ai]; if (si == si0 && Ap[Ai] == 0.0) @@ -538,14 +550,18 @@ if (!center) { - warning_cnt++; - Pp0[Pi] = 0.0; - Pp1[Pi] = 0.0; + //warning_cnt++; + //Pp0[Pi] = 0.0; + //Pp1[Pi] = 0.0; + p0val = 0; + p1val = 0; } else { - Pp0[Pi] /= center; - Pp1[Pi] /= center; + //Pp0[Pi] /= center; + //Pp1[Pi] /= center; + p0val /= center; + p1val /= center; } /*---------------------------------------------- @@ -554,13 +570,17 @@ * interpolation and operator stencils reaching * outside domain. *----------------------------------------------*/ + if (mrk0 != 0) Pp0[Pi] = 0.0; if (mrk1 != 0) Pp1[Pi] = 0.0; + Pp0[Pi] = p0val; + Pp1[Pi] = p1val; } hypre_BoxLoop2End(Ai, Pi); + //hypre_StructCleanIndexD(); } if (warning_cnt) diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_rap5.c hypre-2.13.0/src/struct_ls/pfmg_setup_rap5.c --- hypre-2.11.2/src/struct_ls/pfmg_setup_rap5.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg_setup_rap5.c 2017-10-20 17:42:22.000000000 +0000 @@ -194,12 +194,8 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_ce; HYPRE_Real *rap_cb, *rap_ca; - HYPRE_Real west, east; - HYPRE_Real center_int, center_bdy; - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iPm1, iPp1; + HYPRE_Real center_int, center_bdy; HYPRE_Int OffsetA; HYPRE_Int OffsetP; @@ -348,10 +344,13 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(iP, iA, iAc) { + HYPRE_Int iAm1,iAp1,iPm1,iPp1; + HYPRE_Real west, east; + iAm1 = iA - OffsetA; iAp1 = iA + OffsetA; @@ -407,7 +406,7 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { @@ -442,7 +441,7 @@ A_dbox, bfstart, stridef, iA, RAP_dbox, bcstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(iA, iAc) { diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_rap7.c hypre-2.13.0/src/struct_ls/pfmg_setup_rap7.c --- hypre-2.11.2/src/struct_ls/pfmg_setup_rap7.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg_setup_rap7.c 2017-10-20 17:42:22.000000000 +0000 @@ -201,12 +201,7 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_ce, *rap_cs, *rap_cn; HYPRE_Real *rap_cb, *rap_ca; - HYPRE_Real west, east, south, north; HYPRE_Real center_int, center_bdy; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iPm1, iPp1; HYPRE_Int OffsetA; HYPRE_Int OffsetP; @@ -373,10 +368,13 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east,south,north) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(iP, iA, iAc) { + HYPRE_Int iAm1,iAp1,iPm1,iPp1; + HYPRE_Real west,east,south,north; + iAm1 = iA - OffsetA; iAp1 = iA + OffsetA; diff -Nru hypre-2.11.2/src/struct_ls/pfmg_solve.c hypre-2.13.0/src/struct_ls/pfmg_solve.c --- hypre-2.11.2/src/struct_ls/pfmg_solve.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/pfmg_solve.c 2017-10-20 17:42:22.000000000 +0000 @@ -64,7 +64,7 @@ HYPRE_Int *active_l = (pfmg_data -> active_l); HYPRE_Real b_dot_b = 0, r_dot_r, eps = 0; - HYPRE_Real e_dot_e, x_dot_x; + HYPRE_Real e_dot_e = 0.0, x_dot_x = 1.0; HYPRE_Int i, l; HYPRE_Int constant_coefficient; diff -Nru hypre-2.11.2/src/struct_ls/point_relax.c hypre-2.13.0/src/struct_ls/point_relax.c --- hypre-2.11.2/src/struct_ls/point_relax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/point_relax.c 2017-10-20 17:42:22.000000000 +0000 @@ -102,7 +102,7 @@ HYPRE_Int hypre_PointRelaxDestroy( void *relax_vdata ) { - hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata; + hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata; HYPRE_Int i; if (relax_data) @@ -350,9 +350,6 @@ void *matvec_data = NULL; HYPRE_Int Ai; - HYPRE_Int bi; - HYPRE_Int xi; - HYPRE_Int ti; hypre_IndexRef stride; hypre_IndexRef start; @@ -450,7 +447,7 @@ hypre_BoxArrayBox(hypre_StructVectorDataSpace(b), i); x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); - + Ap = hypre_StructMatrixBoxData(A, i, diag_rank); bp = hypre_StructVectorBoxData(b, i); xp = hypre_StructVectorBoxData(x, i); @@ -471,7 +468,7 @@ b_data_box, start, stride, bi, x_data_box, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(bi, xi) { @@ -488,7 +485,7 @@ b_data_box, start, stride, bi, x_data_box, start, stride, xi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, bi, xi) { @@ -608,7 +605,7 @@ A_data_box, start, stride, Ai, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, ti) { @@ -711,10 +708,6 @@ hypre_IndexRef start; hypre_Index loop_size; HYPRE_Int si, sk, ssi[MAX_DEPTH], depth, k; - HYPRE_Int Ai; - HYPRE_Int bi; - HYPRE_Int xi; - HYPRE_Int ti; stencil = hypre_StructMatrixStencil(A); stencil_shape = hypre_StructStencilShape(stencil); @@ -726,7 +719,7 @@ b_data_box, start, stride, bi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(bi, ti) { @@ -802,7 +795,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -824,7 +817,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -845,7 +838,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -865,7 +858,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -884,7 +877,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -902,7 +895,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -919,7 +912,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ti) { @@ -991,9 +984,6 @@ hypre_Index loop_size; HYPRE_Int si, sk, ssi[MAX_DEPTH], depth, k; HYPRE_Int Ai; - HYPRE_Int bi; - HYPRE_Int xi; - HYPRE_Int ti; stencil = hypre_StructMatrixStencil(A); stencil_shape = hypre_StructStencilShape(stencil); @@ -1018,7 +1008,7 @@ b_data_box, start, stride, bi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(bi, ti) { @@ -1033,7 +1023,7 @@ b_data_box, start, stride, bi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(bi, ti) { @@ -1117,7 +1107,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1144,7 +1134,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1169,7 +1159,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1192,7 +1182,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1213,7 +1203,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1232,7 +1222,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1249,7 +1239,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1514,7 +1504,7 @@ HYPRE_Real weightc = 1 - weight; HYPRE_Real *xp, *tp; - HYPRE_Int compute_i, i, j, xi, ti; + HYPRE_Int compute_i, i, j; hypre_BoxArrayArray *compute_box_aa; hypre_BoxArray *compute_box_a; @@ -1565,7 +1555,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { @@ -1598,7 +1588,7 @@ hypre_Index loop_size; HYPRE_Real *xp, *tp; - HYPRE_Int compute_i, i, j, xi, ti; + HYPRE_Int compute_i, i, j; hypre_BoxArrayArray *compute_box_aa; hypre_BoxArray *compute_box_a; @@ -1649,7 +1639,7 @@ x_data_box, start, stride, xi, t_data_box, start, stride, ti); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, ti) { diff -Nru hypre-2.11.2/src/struct_ls/red_black_constantcoef_gs.c hypre-2.13.0/src/struct_ls/red_black_constantcoef_gs.c --- hypre-2.11.2/src/struct_ls/red_black_constantcoef_gs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/red_black_constantcoef_gs.c 2017-10-20 17:42:22.000000000 +0000 @@ -26,7 +26,7 @@ hypre_StructVector *b, hypre_StructVector *x ) { - hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata; + hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata; HYPRE_Int max_iter = (relax_data -> max_iter); HYPRE_Int zero_guess = (relax_data -> zero_guess); @@ -46,8 +46,8 @@ hypre_Box *x_dbox; HYPRE_Int Ai, Astart, Ani, Anj; - HYPRE_Int bi, bstart, bni, bnj; - HYPRE_Int xi, xstart, xni, xnj; + HYPRE_Int bstart, bni, bnj; + HYPRE_Int xstart, xni, xnj; HYPRE_Int xoff0, xoff1, xoff2, xoff3, xoff4, xoff5; HYPRE_Real *Ap; @@ -68,7 +68,7 @@ HYPRE_Int offd[6]; HYPRE_Int iter, rb, redblack, d; - HYPRE_Int compute_i, i, j, ii, jj, kk; + HYPRE_Int compute_i, i, j; HYPRE_Int ni, nj, nk; /*---------------------------------------------------------- @@ -194,22 +194,17 @@ Ai= hypre_CCBoxIndexRank(A_dbox, start); AApd= 1.0/Ap[Ai]; + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, bi+=2, xi+=2) - { - xp[xi] = bp[bi]*AApd; - } - } + xp[xi] = bp[bi]*AApd; } + hypre_RedBlackConstantcoefLoopEnd(); } else /* variable coefficient diag */ @@ -218,23 +213,18 @@ Ani = hypre_BoxSizeX(A_dbox); Anj = hypre_BoxSizeY(A_dbox); + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = bp[bi] / Ap[Ai]; - } - } + xp[xi] = bp[bi] / Ap[Ai]; } + hypre_RedBlackLoopEnd(); } } @@ -358,75 +348,61 @@ switch(stencil_size) { case 7: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1] - - App2*xp[xi + xoff2] - - App3*xp[xi + xoff3] - - App4*xp[xi + xoff4] - - App5*xp[xi + xoff5])*AApd; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1] - + App2*xp[xi + xoff2] - + App3*xp[xi + xoff3] - + App4*xp[xi + xoff4] - + App5*xp[xi + xoff5])*AApd; } + hypre_RedBlackConstantcoefLoopEnd(); + break; case 5: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1] - - App2*xp[xi + xoff2] - - App3*xp[xi + xoff3])*AApd; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1] - + App2*xp[xi + xoff2] - + App3*xp[xi + xoff3])*AApd; } + hypre_RedBlackConstantcoefLoopEnd(); break; case 3: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1])*AApd; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1])*AApd; } + hypre_RedBlackConstantcoefLoopEnd(); break; } @@ -441,78 +417,63 @@ switch(stencil_size) { case 7: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1] - - App2*xp[xi + xoff2] - - App3*xp[xi + xoff3] - - App4*xp[xi + xoff4] - - App5*xp[xi + xoff5]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1] - + App2*xp[xi + xoff2] - + App3*xp[xi + xoff3] - + App4*xp[xi + xoff4] - + App5*xp[xi + xoff5]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); break; case 5: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1] - - App2*xp[xi + xoff2] - - App3*xp[xi + xoff3]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1] - + App2*xp[xi + xoff2] - + App3*xp[xi + xoff3]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); break; case 3: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - App0*xp[xi + xoff0] - - App1*xp[xi + xoff1]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + App0*xp[xi + xoff0] - + App1*xp[xi + xoff1]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); break; } /* switch(stencil_size) */ diff -Nru hypre-2.11.2/src/struct_ls/red_black_gs.c hypre-2.13.0/src/struct_ls/red_black_gs.c --- hypre-2.11.2/src/struct_ls/red_black_gs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/red_black_gs.c 2017-10-20 17:42:22.000000000 +0000 @@ -53,7 +53,7 @@ HYPRE_Int hypre_RedBlackGSDestroy( void *relax_vdata ) { - hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata; + hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata; if (relax_data) { @@ -147,9 +147,9 @@ hypre_Box *b_dbox; hypre_Box *x_dbox; - HYPRE_Int Ai, Astart, Ani, Anj; - HYPRE_Int bi, bstart, bni, bnj; - HYPRE_Int xi, xstart, xni, xnj; + HYPRE_Int Astart, Ani, Anj; + HYPRE_Int bstart, bni, bnj; + HYPRE_Int xstart, xni, xnj; HYPRE_Int xoff0, xoff1, xoff2, xoff3, xoff4, xoff5; HYPRE_Real *Ap; @@ -166,7 +166,7 @@ HYPRE_Int offd[6]; HYPRE_Int iter, rb, redblack, d; - HYPRE_Int compute_i, i, j, ii, jj, kk; + HYPRE_Int compute_i, i, j; HYPRE_Int ni, nj, nk; /*---------------------------------------------------------- @@ -288,23 +288,18 @@ } } + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = bp[bi] / Ap[Ai]; - } - } + xp[xi] = bp[bi] / Ap[Ai]; } + hypre_RedBlackLoopEnd(); } } } @@ -418,78 +413,64 @@ switch(stencil_size) { case 7: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - Ap0[Ai] * xp[xi + xoff0] - - Ap1[Ai] * xp[xi + xoff1] - - Ap2[Ai] * xp[xi + xoff2] - - Ap3[Ai] * xp[xi + xoff3] - - Ap4[Ai] * xp[xi + xoff4] - - Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + Ap0[Ai] * xp[xi + xoff0] - + Ap1[Ai] * xp[xi + xoff1] - + Ap2[Ai] * xp[xi + xoff2] - + Ap3[Ai] * xp[xi + xoff3] - + Ap4[Ai] * xp[xi + xoff4] - + Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); break; case 5: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - Ap0[Ai] * xp[xi + xoff0] - - Ap1[Ai] * xp[xi + xoff1] - - Ap2[Ai] * xp[xi + xoff2] - - Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + Ap0[Ai] * xp[xi + xoff0] - + Ap1[Ai] * xp[xi + xoff1] - + Ap2[Ai] * xp[xi + xoff2] - + Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); break; case 3: + hypre_RedBlackLoopInit(); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (kk = 0; kk < nk; kk++) + hypre_RedBlackLoopBegin(ni,nj,nk,redblack, + Astart,Ani,Anj,Ai, + bstart,bni,bnj,bi, + xstart,xni,xnj,xi); { - for (jj = 0; jj < nj; jj++) - { - ii = (kk + jj + redblack) % 2; - Ai = Astart + kk*Anj*Ani + jj*Ani + ii; - bi = bstart + kk*bnj*bni + jj*bni + ii; - xi = xstart + kk*xnj*xni + jj*xni + ii; - for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2) - { - xp[xi] = - (bp[bi] - - Ap0[Ai] * xp[xi + xoff0] - - Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai]; - } - } + xp[xi] = + (bp[bi] - + Ap0[Ai] * xp[xi + xoff0] - + Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai]; } + hypre_RedBlackLoopEnd(); + break; } } diff -Nru hypre-2.11.2/src/struct_ls/red_black_gs.h hypre-2.13.0/src/struct_ls/red_black_gs.h --- hypre-2.11.2/src/struct_ls/red_black_gs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/red_black_gs.h 2017-10-20 17:42:22.000000000 +0000 @@ -42,3 +42,226 @@ } hypre_RedBlackGSData; +#ifdef HYPRE_USE_RAJA +#define HYPRE_REDBLACK_PRIVATE hypre__global_error +#define hypre_RedBlackLoopInit() +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\ + Astart,Ani,Anj,Ai, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,Ai,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + Ai = Astart + kk*Anj*Ani + jj*Ani + ii; \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackConstantcoefLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} +#elif defined(HYPRE_USE_KOKKOS) +#define HYPRE_REDBLACK_PRIVATE hypre__global_error +#define hypre_RedBlackLoopInit() +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\ + Astart,Ani,Anj,Ai, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + HYPRE_Int hypre_fake = 0; \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,Ai,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + Ai = Astart + kk*Anj*Ani + jj*Ani + ii; \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackConstantcoefLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} +#elif defined(HYPRE_USE_CUDA) +#define HYPRE_REDBLACK_PRIVATE hypre__global_error +#define hypre_RedBlackLoopInit() +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\ + Astart,Ani,Anj,Ai, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,Ai,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + Ai = Astart + kk*Anj*Ani + jj*Ani + ii; \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \ + { \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackConstantcoefLoopEnd() \ + } \ + }); \ + hypre_fence(); \ +} +#else +#define HYPRE_REDBLACK_PRIVATE hypre__kk +#define hypre_RedBlackLoopInit()\ +{\ + HYPRE_Int hypre__kk; + +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\ + Astart,Ani,Anj,Ai,\ + bstart,bni,bnj,bi,\ + xstart,xni,xnj,xi)\ + for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\ + {\ + HYPRE_Int ii,jj,Ai,bi,xi;\ + for (jj = 0; jj < nj; jj++)\ + {\ + ii = (hypre__kk + jj + redblack) % 2;\ + Ai = Astart + hypre__kk*Anj*Ani + jj*Ani + ii;\ + bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\ + xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\ + for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)\ + { + +#define hypre_RedBlackLoopEnd()\ + }\ + }\ + }\ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\ + bstart,bni,bnj,bi,\ + xstart,xni,xnj,xi)\ + for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\ + {\ + HYPRE_Int ii,jj,bi,xi;\ + for (jj = 0; jj < nj; jj++)\ + {\ + ii = (hypre__kk + jj + redblack) % 2;\ + bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\ + xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\ + for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)\ + { + +#define hypre_RedBlackConstantcoefLoopEnd()\ + }\ + }\ + }\ +} +#endif diff -Nru hypre-2.11.2/src/struct_ls/semi_interp.c hypre-2.13.0/src/struct_ls/semi_interp.c --- hypre-2.11.2/src/struct_ls/semi_interp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/semi_interp.c 2017-10-20 17:42:22.000000000 +0000 @@ -125,8 +125,6 @@ hypre_Box *e_dbox; HYPRE_Int Pi; - HYPRE_Int xci; - HYPRE_Int ei; HYPRE_Int constant_coefficient; HYPRE_Real *Pp0, *Pp1; @@ -201,7 +199,7 @@ e_dbox, start, stride, ei, xc_dbox, startc, stridec, xci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(ei, xci) { @@ -280,7 +278,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(P), loop_size, e_dbox, start, stride, ei); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(ei) { @@ -295,7 +293,7 @@ P_dbox, startc, stridec, Pi, e_dbox, start, stride, ei); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Pi, ei) { diff -Nru hypre-2.11.2/src/struct_ls/semi_restrict.c hypre-2.13.0/src/struct_ls/semi_restrict.c --- hypre-2.11.2/src/struct_ls/semi_restrict.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/semi_restrict.c 2017-10-20 17:42:22.000000000 +0000 @@ -123,8 +123,6 @@ hypre_Box *rc_dbox; HYPRE_Int Ri; - HYPRE_Int ri; - HYPRE_Int rci; HYPRE_Int constant_coefficient; HYPRE_Real *Rp0, *Rp1; @@ -248,7 +246,7 @@ r_dbox, start, stride, ri, rc_dbox, startc, stridec, rci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ri,rci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(ri, rci) { @@ -264,7 +262,7 @@ r_dbox, start, stride, ri, rc_dbox, startc, stridec, rci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ri, ri, rci) { diff -Nru hypre-2.11.2/src/struct_ls/semi_setup_rap.c hypre-2.13.0/src/struct_ls/semi_setup_rap.c --- hypre-2.11.2/src/struct_ls/semi_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/semi_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -35,7 +35,7 @@ if (imacro==2) imacro=-1; \ if (jmacro==2) jmacro=-1; \ if (kmacro==2) kmacro=-1; \ - hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro); \ + hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro); \ } /*-------------------------------------------------------------------------- @@ -322,11 +322,6 @@ HYPRE_Real *rap_ptrS, *rap_ptrU, *rap_ptrD; HYPRE_Int symm_path_multiplier; - - HYPRE_Int iA, iAp; - HYPRE_Int iAc; - HYPRE_Int iP, iPp; - HYPRE_Int iR; HYPRE_Int COffsetA; HYPRE_Int COffsetP; @@ -561,10 +556,11 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { + HYPRE_Int iAp,iPp; /* path 1 : (stay,stay) */ rap_ptrS[iAc] += a_ptr[iA] ; @@ -597,10 +593,11 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { + HYPRE_Int iAp,iPp; /* path 1 : (stay,stay) */ rap_ptrS[iAc] += a_ptr[iA] ; @@ -670,10 +667,11 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { + HYPRE_Int iAp,iPp; /* Path 1 : (stay,up) & symmetric path */ iPp = iP + AOffsetP; rap_ptrS[iAc] += symm_path_multiplier * @@ -739,10 +737,11 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { + HYPRE_Int iAp,iPp; /* Path 1 : (stay,up) */ iPp = iP + COffsetP + AOffsetP; rap_ptrU[iAc] += a_ptr[iA] * pb[iPp]; diff -Nru hypre-2.11.2/src/struct_ls/smg2_setup_rap.c hypre-2.13.0/src/struct_ls/smg2_setup_rap.c --- hypre-2.11.2/src/struct_ls/smg2_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg2_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -186,11 +186,6 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; - HYPRE_Int yOffsetA; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -337,7 +332,7 @@ * Switch statement to direct control to apropriate BoxLoop depending * on stencil size. Default is full 9-point. *-----------------------------------------------------------------*/ - + switch (fine_stencil_size) { @@ -357,29 +352,28 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; - - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]; - + iP1 = iP - yOffsetP; rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1] + rb[iR] * a_cs[iAm1] + a_cs[iA] * pa[iP1]; - + iP1 = iP - yOffsetP + xOffsetP; rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1]; - + iP1 = iP - xOffsetP; rap_cw[iAc] = a_cw[iA] + rb[iR] * a_cw[iAm1] * pb[iP1] + ra[iR] * a_cw[iAp1] * pa[iP1]; - + rap_cc[iAc] = a_cc[iA] + rb[iR] * a_cc[iAm1] * pb[iP] + ra[iR] * a_cc[iAp1] * pa[iP] @@ -387,10 +381,9 @@ + ra[iR] * a_cs[iAp1] + a_cs[iA] * pb[iP] + a_cn[iA] * pa[iP]; - } hypre_BoxLoop4End(iP, iR, iA, iAc); - + break; /*-------------------------------------------------------------- @@ -409,14 +402,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1] + rb[iR] * a_csw[iAm1] + a_csw[iA] * pa[iP1]; @@ -506,11 +499,6 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_cnw, *rap_cne; - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; - HYPRE_Int yOffsetA; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -663,7 +651,7 @@ *--------------------------------------------------------------*/ case 5: - + hypre_BoxGetSize(cgrid_box, loop_size); hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, PT_dbox, cstart, stridec, iP, @@ -671,14 +659,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]; iP1 = iP + yOffsetP; @@ -706,7 +694,6 @@ *--------------------------------------------------------------*/ default: - hypre_BoxGetSize(cgrid_box, loop_size); hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, PT_dbox, cstart, stridec, iP, @@ -714,14 +701,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1] + ra[iR] * a_cne[iAp1] + a_cne[iA] * pb[iP1]; @@ -785,9 +772,6 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iAc; - HYPRE_Int iAcm1; - HYPRE_Int xOffset; HYPRE_Real zero = 0.0; @@ -836,11 +820,11 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcm1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { - iAcm1 = iAc - xOffset; + HYPRE_Int iAcm1 = iAc - xOffset; rap_cw[iAc] += (rap_cse[iAcm1] + rap_csw[iAc]); rap_cc[iAc] += (2.0 * rap_cs[iAc]); @@ -850,7 +834,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { @@ -895,8 +879,6 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_cnw, *rap_cne; - HYPRE_Int iAc; - HYPRE_Real zero = 0.0; hypre_SetIndex3(stridec, 1, 1, 1); @@ -950,7 +932,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { diff -Nru hypre-2.11.2/src/struct_ls/smg3_setup_rap.c hypre-2.13.0/src/struct_ls/smg3_setup_rap.c --- hypre-2.11.2/src/struct_ls/smg3_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg3_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -268,11 +268,6 @@ HYPRE_Real *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn; HYPRE_Real *rap_csw, *rap_cse; HYPRE_Real *rap_bsw, *rap_bse, *rap_bnw, *rap_bne; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int zOffsetA; HYPRE_Int xOffsetP; @@ -574,14 +569,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) - { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + { + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP; rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - xOffsetP; @@ -638,14 +633,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP; rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1] + rb[iR] * a_bs[iAm1] + a_bs[iA] * pa[iP1]; @@ -719,14 +714,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - yOffsetP; @@ -759,7 +754,7 @@ rap_bn[iAc] = rb[iR] * a_cn[iAm1] * pa[iP1] + rb[iR] * a_bn[iAm1] + a_bn[iA] * pa[iP1]; - + iP1 = iP - zOffsetP + yOffsetP + xOffsetP; rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1]; @@ -822,14 +817,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1] + rb[iR] * a_bsw[iAm1] + a_bsw[iA] * pa[iP1]; @@ -982,11 +977,6 @@ HYPRE_Real *rap_cnw, *rap_cne; HYPRE_Real *rap_asw, *rap_ase, *rap_anw, *rap_ane; - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; - HYPRE_Int zOffsetA; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -1282,14 +1272,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP; rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + xOffsetP; @@ -1337,14 +1327,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP; rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1] + ra[iR] * a_an[iAp1] + a_an[iA] * pb[iP1]; @@ -1411,14 +1401,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + yOffsetP; @@ -1443,7 +1433,7 @@ rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1] + ra[iR] * a_aw[iAp1] + a_aw[iA] * pb[iP1]; - + iP1 = iP + zOffsetP - yOffsetP + xOffsetP; rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1]; @@ -1506,14 +1496,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1] + ra[iR] * a_ane[iAp1] + a_ane[iA] * pb[iP1]; @@ -1637,12 +1627,6 @@ HYPRE_Real *rap_bsw, *rap_bse, *rap_bnw, *rap_bne; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iAc; - HYPRE_Int iAcmx; - HYPRE_Int iAcmy; - HYPRE_Int iAcmxmy; - HYPRE_Int iAcpxmy; - HYPRE_Int xOffset; HYPRE_Int yOffset; @@ -1739,12 +1723,12 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmx,iAcmy) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { - iAcmx = iAc - xOffset; - iAcmy = iAc - yOffset; + HYPRE_Int iAcmx = iAc - xOffset; + HYPRE_Int iAcmy = iAc - yOffset; rap_cc[iAc] += (2.0 * rap_bc[iAc]); rap_cw[iAc] += (rap_bw[iAc] + rap_be[iAcmx]); @@ -1755,7 +1739,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { @@ -1778,12 +1762,12 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmxmy,iAcpxmy) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { - iAcmxmy = iAc - xOffset - yOffset; - iAcpxmy = iAc + xOffset - yOffset; + HYPRE_Int iAcmxmy = iAc - xOffset - yOffset; + HYPRE_Int iAcpxmy = iAc + xOffset - yOffset; rap_csw[iAc] += (rap_bsw[iAc] + rap_bne[iAcmxmy]); @@ -1795,7 +1779,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { @@ -1845,8 +1829,6 @@ HYPRE_Real *rap_csw, *rap_cse, *rap_cnw, *rap_cne; HYPRE_Real *rap_asw, *rap_ase, *rap_anw, *rap_ane; - HYPRE_Int iAc; - HYPRE_Real zero = 0.0; hypre_StructStencil *stencil; @@ -1974,7 +1956,7 @@ RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) @@ -2012,7 +1994,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(iAc) { diff -Nru hypre-2.11.2/src/struct_ls/smg_axpy.c hypre-2.13.0/src/struct_ls/smg_axpy.c --- hypre-2.11.2/src/struct_ls/smg_axpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg_axpy.c 2017-10-20 17:42:22.000000000 +0000 @@ -25,9 +25,6 @@ HYPRE_Int ndim = hypre_StructVectorNDim(x); hypre_Box *x_data_box; hypre_Box *y_data_box; - - HYPRE_Int xi; - HYPRE_Int yi; HYPRE_Real *xp; HYPRE_Real *yp; @@ -58,7 +55,7 @@ x_data_box, start, base_stride, xi, y_data_box, start, base_stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { diff -Nru hypre-2.11.2/src/struct_ls/smg.c hypre-2.13.0/src/struct_ls/smg.c --- hypre-2.11.2/src/struct_ls/smg.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg.c 2017-10-20 17:42:22.000000000 +0000 @@ -53,7 +53,7 @@ HYPRE_Int hypre_SMGDestroy( void *smg_vdata ) { - hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata; + hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata; HYPRE_Int l; @@ -111,7 +111,7 @@ hypre_StructVectorDestroy(smg_data -> tb_l[l+1]); hypre_StructVectorDestroy(smg_data -> tx_l[l+1]); } - hypre_SharedTFree(smg_data -> data); + hypre_DeviceTFree(smg_data -> data); hypre_TFree(smg_data -> grid_l); hypre_TFree(smg_data -> PT_grid_l); hypre_TFree(smg_data -> A_l); @@ -468,7 +468,6 @@ { hypre_Box *v_data_box; - HYPRE_Int vi; HYPRE_Real *vp; hypre_Box *box; @@ -495,7 +494,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { diff -Nru hypre-2.11.2/src/struct_ls/smg_relax.c hypre-2.13.0/src/struct_ls/smg_relax.c --- hypre-2.11.2/src/struct_ls/smg_relax.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg_relax.c 2017-10-20 17:42:22.000000000 +0000 @@ -263,7 +263,6 @@ residual_data = (relax_data -> residual_data); solve_data = (relax_data -> solve_data); - /*---------------------------------------------------------- * Set zero values *----------------------------------------------------------*/ @@ -315,7 +314,6 @@ (relax_data -> num_iterations) = (i + 1); } } - /*---------------------------------------------------------- * Free up memory according to memory_use parameter *----------------------------------------------------------*/ @@ -586,6 +584,7 @@ hypre_SMGSetMemoryUse(solve_data[i], (relax_data -> memory_use)); hypre_SMGSetTol(solve_data[i], 0.0); hypre_SMGSetMaxIter(solve_data[i], 1); + hypre_SMGSetup(solve_data[i], A_sol, temp_vec, x); } else diff -Nru hypre-2.11.2/src/struct_ls/smg_residual.c hypre-2.13.0/src/struct_ls/smg_residual.c --- hypre-2.11.2/src/struct_ls/smg_residual.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg_residual.c 2017-10-20 17:42:22.000000000 +0000 @@ -138,11 +138,6 @@ hypre_Box *x_data_box; hypre_Box *b_data_box; hypre_Box *r_data_box; - - HYPRE_Int Ai; - HYPRE_Int xi; - HYPRE_Int bi; - HYPRE_Int ri; HYPRE_Real *Ap; HYPRE_Real *xp; @@ -201,7 +196,7 @@ b_data_box, start, base_stride, bi, r_data_box, start, base_stride, ri); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ri) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(bi, ri) { @@ -253,7 +248,7 @@ x_data_box, start, base_stride, xi, r_data_box, start, base_stride, ri); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ri) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, ri) { diff -Nru hypre-2.11.2/src/struct_ls/smg_setup.c hypre-2.13.0/src/struct_ls/smg_setup.c --- hypre-2.11.2/src/struct_ls/smg_setup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -186,6 +186,7 @@ for (l = 0; l < (num_levels - 1); l++) { PT_l[l] = hypre_SMGCreateInterpOp(A_l[l], PT_grid_l[l+1], cdir); + hypre_StructMatrixInitializeShell(PT_l[l]); data_size += hypre_StructMatrixDataSize(PT_l[l]); @@ -228,7 +229,8 @@ hypre_StructVectorInitializeShell(tx_l[l+1]); } - data = hypre_SharedCTAlloc(HYPRE_Real, data_size); + data = hypre_DeviceCTAlloc(HYPRE_Real,data_size); + (smg_data -> data) = data; hypre_StructVectorInitializeData(tb_l[0], data); @@ -331,6 +333,7 @@ hypre_SMGRelaxSetTempVec(relax_data_l[l], tb_l[l]); hypre_SMGRelaxSetNumPreRelax( relax_data_l[l], n_pre); hypre_SMGRelaxSetNumPostRelax( relax_data_l[l], n_post); + hypre_SMGRelaxSetup(relax_data_l[l], A_l[l], b_l[l], x_l[l]); hypre_SMGSetupInterpOp(relax_data_l[l], A_l[l], b_l[l], x_l[l], diff -Nru hypre-2.11.2/src/struct_ls/smg_setup_interp.c hypre-2.13.0/src/struct_ls/smg_setup_interp.c --- hypre-2.11.2/src/struct_ls/smg_setup_interp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/smg_setup_interp.c 2017-10-20 17:42:22.000000000 +0000 @@ -113,8 +113,6 @@ hypre_Box *x_data_box; HYPRE_Real *PTp; HYPRE_Real *xp; - HYPRE_Int PTi; - HYPRE_Int xi; hypre_Index loop_size; hypre_Index start; @@ -263,7 +261,7 @@ x_data_box, start, stride, xi, PT_data_box, startc, stridec, PTi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,PTi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, PTi) { diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg2_setup_rap.c hypre-2.13.0/src/struct_ls/sparse_msg2_setup_rap.c --- hypre-2.11.2/src/struct_ls/sparse_msg2_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg2_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -201,11 +201,6 @@ HYPRE_Real *rap_cc, *rap_cw, *rap_cs; HYPRE_Real *rap_csw, *rap_cse; - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; - HYPRE_Int yOffsetA; HYPRE_Int xOffsetP; HYPRE_Int yOffsetP; @@ -397,14 +392,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]; iP1 = iP - yOffsetP; @@ -442,21 +437,20 @@ default: hypre_BoxGetSize(cgrid_box, loop_size); - hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, Pstart, stridePR, iP, R_dbox, Pstart, stridePR, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - yOffsetP - xOffsetP; rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1] + rb[iR] * a_csw[iAm1] + a_csw[iA] * pa[iP1]; @@ -548,11 +542,6 @@ HYPRE_Real *rap_ce, *rap_cn; HYPRE_Real *rap_cnw, *rap_cne; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int yOffsetA; HYPRE_Int xOffsetP; @@ -730,21 +719,20 @@ case 5: hypre_BoxGetSize(cgrid_box, loop_size); - hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, Pstart, stridePR, iP, R_dbox, Pstart, stridePR, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]; iP1 = iP + yOffsetP; @@ -773,21 +761,20 @@ default: hypre_BoxGetSize(cgrid_box, loop_size); - hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, Pstart, stridePR, iP, R_dbox, Pstart, stridePR, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - yOffsetA; - iAp1 = iA + yOffsetA; + HYPRE_Int iAm1 = iA - yOffsetA; + HYPRE_Int iAp1 = iA + yOffsetA; - iP1 = iP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + yOffsetP + xOffsetP; rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1] + ra[iR] * a_cne[iAp1] + a_cne[iA] * pb[iP1]; diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg3_setup_rap.c hypre-2.13.0/src/struct_ls/sparse_msg3_setup_rap.c --- hypre-2.11.2/src/struct_ls/sparse_msg3_setup_rap.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg3_setup_rap.c 2017-10-20 17:42:22.000000000 +0000 @@ -225,11 +225,6 @@ HYPRE_Real *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn; HYPRE_Real *rap_csw, *rap_cse; HYPRE_Real *rap_bsw, *rap_bse, *rap_bnw, *rap_bne; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int zOffsetA; HYPRE_Int xOffsetP; @@ -563,14 +558,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) - { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + { + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP; rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - xOffsetP; @@ -633,14 +628,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]; iP1 = iP - zOffsetP - yOffsetP; @@ -737,14 +732,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP - zOffsetP - yOffsetP - xOffsetP; + HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP; rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1] + rb[iR] * a_bsw[iAm1] + a_bsw[iA] * pa[iP1]; @@ -898,11 +893,6 @@ HYPRE_Real *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an; HYPRE_Real *rap_cnw, *rap_cne; HYPRE_Real *rap_asw, *rap_ase, *rap_anw, *rap_ane; - - HYPRE_Int iA, iAm1, iAp1; - HYPRE_Int iAc; - HYPRE_Int iP, iP1; - HYPRE_Int iR; HYPRE_Int zOffsetA; HYPRE_Int xOffsetP; @@ -1221,21 +1211,20 @@ case 7: hypre_BoxGetSize(cgrid_box, loop_size); - hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size, P_dbox, Pstart, stridePR, iP, R_dbox, Pstart, stridePR, iR, A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP; rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + xOffsetP; @@ -1289,14 +1278,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]; iP1 = iP + zOffsetP + yOffsetP; @@ -1385,14 +1374,14 @@ A_dbox, fstart, stridef, iA, RAP_dbox, cstart, stridec, iAc); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(iP, iR, iA, iAc) { - iAm1 = iA - zOffsetA; - iAp1 = iA + zOffsetA; + HYPRE_Int iAm1 = iA - zOffsetA; + HYPRE_Int iAp1 = iA + zOffsetA; - iP1 = iP + zOffsetP + yOffsetP + xOffsetP; + HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP; rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1] + ra[iR] * a_ane[iAp1] + a_ane[iA] * pb[iP1]; diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_filter.c hypre-2.13.0/src/struct_ls/sparse_msg_filter.c --- hypre-2.11.2/src/struct_ls/sparse_msg_filter.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg_filter.c 2017-10-20 17:42:22.000000000 +0000 @@ -325,22 +325,14 @@ hypre_Box *A_dbox; hypre_Box *v_dbox; - HYPRE_Int Ai; - HYPRE_Int vi; - - HYPRE_Real *Ap; HYPRE_Real *vxp; HYPRE_Real *vyp; HYPRE_Real *vzp; - HYPRE_Real lambdax; - HYPRE_Real lambday; - HYPRE_Real lambdaz; + hypre_StructStencil *stencil; hypre_Index *stencil_shape; - HYPRE_Int stencil_size; - - HYPRE_Int Astenc; + HYPRE_Int stencil_size; hypre_Index loop_size; hypre_Index cindex; @@ -349,7 +341,7 @@ hypre_Index stride; hypre_Index stridev; - HYPRE_Int i, si; + HYPRE_Int i; /*---------------------------------------------------------- * Initialize some things @@ -375,6 +367,9 @@ compute_boxes = hypre_StructGridBoxes(hypre_StructMatrixGrid(A)); hypre_ForBoxI(i, compute_boxes) { + + hypre_MatrixIndexMove(A, stencil_size, i, ierr,3); + compute_box = hypre_BoxArrayBox(compute_boxes, i); A_dbox = hypre_BoxArrayBox(hypre_StructMatrixDataSpace(A), i); @@ -392,20 +387,28 @@ A_dbox, start, stride, Ai, v_dbox, startv, stridev, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,vi,lambdax,lambday,lambdaz,si,Ap,Astenc) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Ai, vi) { + HYPRE_Real lambdax,lambday,lambdaz; + HYPRE_Real *Ap; + HYPRE_Int si,Astenc; + lambdax = 0.0; lambday = 0.0; lambdaz = 0.0; + for (si = 0; si < stencil_size; si++) { - Ap = hypre_StructMatrixBoxData(A, i, si); - + //Ap = hypre_StructMatrixBoxData(A, i, si); + //Ap = data_A + indices_d[si]; + Ap = hypre_StructGetMatrixBoxData(A, i, si); /* compute lambdax */ - Astenc = hypre_IndexD(stencil_shape[si], 0); + //Astenc = hypre_IndexD(stencil_shape[si], 0); + //Astenc = stencil_shape_d[si]; + Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]); if (Astenc == 0) { lambdax += Ap[Ai]; @@ -416,7 +419,10 @@ } /* compute lambday */ - Astenc = hypre_IndexD(stencil_shape[si], 1); + //Astenc = hypre_IndexD(stencil_shape[si], 1); + //Astenc = stencil_shape_d[stencil_size+si]; + Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]); + if (Astenc == 0) { lambday += Ap[Ai]; @@ -427,7 +433,9 @@ } /* compute lambdaz */ - Astenc = hypre_IndexD(stencil_shape[si], 2); + //Astenc = hypre_IndexD(stencil_shape[si], 2); + //Astenc = stencil_shape_d[2*stencil_size+si]; + Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]); if (Astenc == 0) { lambdaz += Ap[Ai]; @@ -447,6 +455,8 @@ vzp[vi] = lambdaz / (lambdax + lambday + lambdaz); } hypre_BoxLoop2End(Ai, vi); + + hypre_StructCleanIndexD(); } return ierr; @@ -472,9 +482,6 @@ hypre_Box *e_dbox; hypre_Box *v_dbox; - HYPRE_Int ei; - HYPRE_Int vi; - HYPRE_Real *ep; HYPRE_Real *vp; @@ -519,7 +526,7 @@ e_dbox, start, stride, ei, v_dbox, startv, stridev, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,vi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(ei, vi) { diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_interp.c hypre-2.13.0/src/struct_ls/sparse_msg_interp.c --- hypre-2.11.2/src/struct_ls/sparse_msg_interp.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg_interp.c 2017-10-20 17:42:22.000000000 +0000 @@ -58,7 +58,7 @@ hypre_Index stride, hypre_Index strideP ) { - hypre_SparseMSGInterpData *interp_data = (hypre_SparseMSGInterpData *)interp_vdata; + hypre_SparseMSGInterpData *interp_data = (hypre_SparseMSGInterpData *)interp_vdata; hypre_StructGrid *grid; hypre_StructStencil *stencil; @@ -131,10 +131,6 @@ hypre_Box *P_dbox; hypre_Box *xc_dbox; hypre_Box *e_dbox; - - HYPRE_Int Pi; - HYPRE_Int xci; - HYPRE_Int ei; HYPRE_Real *Pp0, *Pp1; HYPRE_Real *xcp; @@ -203,7 +199,7 @@ e_dbox, start, stride, ei, xc_dbox, startc, stridec, xci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(ei, xci) { @@ -263,7 +259,7 @@ P_dbox, startP, strideP, Pi, e_dbox, start, stride, ei); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(Pi, ei) { diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_restrict.c hypre-2.13.0/src/struct_ls/sparse_msg_restrict.c --- hypre-2.11.2/src/struct_ls/sparse_msg_restrict.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg_restrict.c 2017-10-20 17:42:22.000000000 +0000 @@ -58,7 +58,7 @@ hypre_Index stride, hypre_Index strideR ) { - hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata; + hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata; hypre_StructGrid *grid; hypre_StructStencil *stencil; @@ -129,10 +129,6 @@ hypre_Box *R_dbox; hypre_Box *r_dbox; hypre_Box *rc_dbox; - - HYPRE_Int Ri; - HYPRE_Int ri; - HYPRE_Int rci; HYPRE_Real *Rp0, *Rp1; HYPRE_Real *rp, *rp0, *rp1; @@ -231,7 +227,7 @@ r_dbox, start, stride, ri, rc_dbox, startc, stridec, rci); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ri, ri, rci) { diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_setup.c hypre-2.13.0/src/struct_ls/sparse_msg_setup.c --- hypre-2.11.2/src/struct_ls/sparse_msg_setup.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_ls/sparse_msg_setup.c 2017-10-20 17:42:22.000000000 +0000 @@ -10,8 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - /****************************************************************************** * * @@ -24,23 +22,23 @@ #define GRID 0 -#define hypre_SparseMSGSetCIndex(cdir, cindex) \ -{\ - hypre_SetIndex3(cindex, 0, 0, 0);\ - hypre_IndexD(cindex, cdir) = 0;\ -} - -#define hypre_SparseMSGSetFIndex(cdir, findex) \ -{\ - hypre_SetIndex3(findex, 0, 0, 0);\ - hypre_IndexD(findex, cdir) = 1;\ -} - -#define hypre_SparseMSGSetStride(cdir, stride) \ -{\ - hypre_SetIndex3(stride, 1, 1, 1);\ - hypre_IndexD(stride, cdir) = 2;\ -} +#define hypre_SparseMSGSetCIndex(cdir, cindex) \ + { \ + hypre_SetIndex3(cindex, 0, 0, 0); \ + hypre_IndexD(cindex, cdir) = 0; \ + } + +#define hypre_SparseMSGSetFIndex(cdir, findex) \ + { \ + hypre_SetIndex3(findex, 0, 0, 0); \ + hypre_IndexD(findex, cdir) = 1; \ + } + +#define hypre_SparseMSGSetStride(cdir, stride) \ + { \ + hypre_SetIndex3(stride, 1, 1, 1); \ + hypre_IndexD(stride, cdir) = 2; \ + } /*-------------------------------------------------------------------------- * hypre_SparseMSGSetup @@ -52,7 +50,7 @@ hypre_StructVector *b, hypre_StructVector *x ) { - hypre_SparseMSGData *smsg_data = (hypre_SparseMSGData *)smsg_vdata; + hypre_SparseMSGData *smsg_data = (hypre_SparseMSGData *)smsg_vdata; MPI_Comm comm = (smsg_data -> comm); @@ -559,7 +557,7 @@ } } - data = hypre_SharedCTAlloc(HYPRE_Real, data_size); + data = hypre_DeviceCTAlloc(HYPRE_Real,data_size); (smsg_data -> data) = data; hypre_StructVectorInitializeData(t_a[0], data); diff -Nru hypre-2.11.2/src/struct_mv/assumed_part.c hypre-2.13.0/src/struct_mv/assumed_part.c --- hypre-2.11.2/src/struct_mv/assumed_part.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/assumed_part.c 2017-10-20 17:42:22.000000000 +0000 @@ -158,9 +158,7 @@ } count = 0; - hypre_BoxLoop0Begin(ndim, div); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, div); { box = hypre_BoxArrayBox(box_array, count); hypre_BoxLoopGetIndex(index); @@ -172,7 +170,7 @@ } count++; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); /* clean up */ for (i = 0; i < ndim; i++) @@ -1635,7 +1633,7 @@ HYPRE_Int i, d, p, q, r, myid; HYPRE_Int num_regions, in_regions, this_region, proc_count, proc_start; - HYPRE_Int adj_proc_id, extra, num_partitions, part_num; + HYPRE_Int adj_proc_id, extra, num_partitions; HYPRE_Int width; HYPRE_Int *proc_array, proc_array_count; @@ -1772,9 +1770,7 @@ hypre_SetIndex(stride, 1); hypre_BoxGetSize(part_box, loop_size); hypre_BoxSetExtents(part_dbox, stride, div); - hypre_BoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop1For(part_num) + hypre_SerialBoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num); { /*convert the partition number to a processor number*/ if (part_num < (2*extra)) @@ -1795,7 +1791,7 @@ proc_ids[num_proc_ids] = adj_proc_id + proc_start; num_proc_ids++; } - hypre_BoxLoop1End(part_num); + hypre_SerialBoxLoop1End(part_num); } /*end of for each region loop*/ diff -Nru hypre-2.11.2/src/struct_mv/box.h hypre-2.13.0/src/struct_mv/box.h --- hypre-2.11.2/src/struct_mv/box.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/box.h 2017-10-20 17:42:22.000000000 +0000 @@ -170,329 +170,155 @@ * BoxLoop macros: *--------------------------------------------------------------------------*/ -#if 0 /* set to 0 to use the new box loops */ +#ifdef HYPRE_USE_RAJA +#define hypre_Reductioninit(local_result)\ +HYPRE_Real local_result;\ +local_result = 0.0; +//ReduceSum< cuda_reduce, HYPRE_Real> local_result(0.0); +#else +#define hypre_Reductioninit(local_result)\ +HYPRE_Real local_result;\ +local_result = 0.0; +#endif -#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k +#if defined(HYPRE_MEMORY_GPU) -#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \ -HYPRE_Int sx = (hypre_IndexX(stride));\ -HYPRE_Int sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\ -HYPRE_Int sz = (hypre_IndexZ(stride)*\ - hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox)) - -#define hypre_BoxLoopDeclareN(loop_size) \ -HYPRE_Int hypre__i, hypre__j, hypre__k;\ -HYPRE_Int hypre__nx = hypre_IndexX(loop_size);\ -HYPRE_Int hypre__ny = hypre_IndexY(loop_size);\ -HYPRE_Int hypre__nz = hypre_IndexZ(loop_size);\ -HYPRE_Int hypre__mx = hypre__nx;\ -HYPRE_Int hypre__my = hypre__ny;\ -HYPRE_Int hypre__mz = hypre__nz;\ -HYPRE_Int hypre__dir, hypre__max;\ -HYPRE_Int hypre__div, hypre__mod;\ -HYPRE_Int hypre__block, hypre__num_blocks;\ -hypre__dir = 0;\ -hypre__max = hypre__nx;\ -if (hypre__ny > hypre__max)\ -{\ - hypre__dir = 1;\ - hypre__max = hypre__ny;\ -}\ -if (hypre__nz > hypre__max)\ -{\ - hypre__dir = 2;\ - hypre__max = hypre__nz;\ -}\ -hypre__num_blocks = hypre_NumThreads();\ -if (hypre__max < hypre__num_blocks)\ -{\ - hypre__num_blocks = hypre__max;\ +#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\ +HYPRE_Int * indices_d;\ +HYPRE_Int indices_h[stencil_size];\ +HYPRE_Int * stencil_shape_d;\ +HYPRE_Int stencil_shape_h[size*stencil_size];\ +HYPRE_Complex * data_A = hypre_StructMatrixData(A);\ +indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\ +stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\ +for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\ +{\ + HYPRE_Int jj = 0;\ + indices_h[ii] = hypre_StructMatrixDataIndices(A)[i][ii];\ + if (size > 1) cdir = 0;\ + stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\ + for (jj = 1;jj < size;jj++)\ + stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\ }\ -if (hypre__num_blocks > 0)\ -{\ - hypre__div = hypre__max / hypre__num_blocks;\ - hypre__mod = hypre__max % hypre__num_blocks;\ -} +hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\ +hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\ -#define hypre_BoxLoopSet(i, j, k) \ -i = 0;\ -j = 0;\ -k = 0;\ -hypre__nx = hypre__mx;\ -hypre__ny = hypre__my;\ -hypre__nz = hypre__mz;\ -if (hypre__num_blocks > 1)\ -{\ - if (hypre__dir == 0)\ - {\ - i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ - else if (hypre__dir == 1)\ - {\ - j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ - else if (hypre__dir == 2)\ - {\ - k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ -} +#define hypre_StructGetMatrixBoxData(A, i, si) (data_A + indices_d[si]) -#define hypre_BoxLoopGetIndex(index) \ -index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k +#define hypre_StructGetIndexD(index,i,index_d) (index_d) -/* Use this before the For macros below to force only one block */ -#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1 +#define hypre_StructCleanIndexD()\ +hypre_DeviceTFree(indices_d);\ +hypre_DeviceTFree(stencil_shape_d); -/* Use this to get the block iteration inside a BoxLoop */ -#define hypre_BoxLoopBlock() hypre__block +#define hypre_StructPreparePrint()\ +HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\ +data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\ +hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size); -/*-----------------------------------*/ +#define hypre_StructPostPrint() hypre_TFree(data_host) -#define hypre_BoxLoop0Begin(ndim, loop_size)\ -{\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop0For()\ - hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k) -#define hypre__BoxLoop0For(i, j, k)\ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ - {\ - hypre_BoxLoopSet(i, j, k);\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ - {\ - for (i = 0; i < hypre__nx; i++)\ - { - -#define hypre_BoxLoop0End()\ - }\ - }\ - }\ - }\ -} - -/*-----------------------------------*/ +#else -#define hypre_BoxLoop1Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1)\ -{\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop1For(i1)\ - hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1) -#define hypre__BoxLoop1For(i, j, k, i1)\ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ - {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ - {\ - for (i = 0; i < hypre__nx; i++)\ - { +#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size) +#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si) +#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i) +#define hypre_StructCleanIndexD() {;} +#define hypre_StructPreparePrint() data_host = data; +#define hypre_StructPostPrint() {;} -#define hypre_BoxLoop1End(i1)\ - i1 += hypre__sx1;\ - }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - }\ - }\ -} +#endif -/*-----------------------------------*/ - -#define hypre_BoxLoop2Begin(ndim,loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2)\ +#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop2For(i1, i2)\ - hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2) -#define hypre__BoxLoop2For(i, j, k, i1, i2)\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopInit(ndim, loop_size);\ + hypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop2End(i1, i2)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ +#define hypre_SerialBoxLoop0End()\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ + zypre_BoxLoopInc1();\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - }\ }\ } -/*-----------------------------------*/ - -#define hypre_BoxLoop3Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2,\ - dbox3, start3, stride3, i3)\ +#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - HYPRE_Int hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop3For(i1, i2, i3)\ - hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3) -#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\ + HYPRE_Int i1;\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop3End(i1, i2, i3)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ - i3 += hypre__sx3;\ +#define hypre_SerialBoxLoop1End(i1)\ + i1 += hypre__i0inc1;\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ - i3 += hypre__sy3 - hypre__nx*hypre__sx3;\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - i3 += hypre__sz3 - hypre__ny*hypre__sy3;\ - }\ }\ } -/*-----------------------------------*/ - -#define hypre_BoxLoop4Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2,\ - dbox3, start3, stride3, i3,\ - dbox4, start4, stride4, i4)\ +#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1,\ + dbox2, start2, stride2, i2)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - HYPRE_Int hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\ - HYPRE_Int hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\ - hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop4For(i1, i2, i3, i4)\ - hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4) -#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\ + HYPRE_Int i1,i2;\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + zypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\ - i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop4End(i1, i2, i3, i4)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ - i3 += hypre__sx3;\ - i4 += hypre__sx4;\ +#define hypre_SerialBoxLoop2End(i1, i2)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ - i3 += hypre__sy3 - hypre__nx*hypre__sx3;\ - i4 += hypre__sy4 - hypre__nx*hypre__sx4;\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - i3 += hypre__sz3 - hypre__ny*hypre__sy3;\ - i4 += hypre__sz4 - hypre__ny*hypre__sy4;\ - }\ }\ } -/*-----------------------------------*/ - +#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) +#define HYPRE_BOX_PRIVATE hypre__global_error #else - -#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE - -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_BoxLoop0Begin -#define hypre_BoxLoop0For zypre_BoxLoop0For -#define hypre_BoxLoop0End zypre_BoxLoop0End -#define hypre_BoxLoop1Begin zypre_BoxLoop1Begin -#define hypre_BoxLoop1For zypre_BoxLoop1For -#define hypre_BoxLoop1End zypre_BoxLoop1End -#define hypre_BoxLoop2Begin zypre_BoxLoop2Begin -#define hypre_BoxLoop2For zypre_BoxLoop2For -#define hypre_BoxLoop2End zypre_BoxLoop2End -#define hypre_BoxLoop3Begin zypre_BoxLoop3Begin -#define hypre_BoxLoop3For zypre_BoxLoop3For -#define hypre_BoxLoop3End zypre_BoxLoop3End -#define hypre_BoxLoop4Begin zypre_BoxLoop4Begin -#define hypre_BoxLoop4For zypre_BoxLoop4For -#define hypre_BoxLoop4End zypre_BoxLoop4End - -#endif /* end if 1 */ - +#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE #endif - -/****************************************************************************** - * - * NEW BoxLoop STUFF - * - *****************************************************************************/ - -#ifndef hypre_ZBOX_HEADER -#define hypre_ZBOX_HEADER - #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i -/*-------------------------------------------------------------------------- - * BoxLoop macros: - *--------------------------------------------------------------------------*/ - #define zypre_BoxLoopDeclare() \ HYPRE_Int hypre__tot, hypre__div, hypre__mod;\ HYPRE_Int hypre__block, hypre__num_blocks;\ @@ -629,6 +455,7 @@ #define zypre_BoxLoop1Begin(ndim, loop_size,\ dbox1, start1, stride1, i1)\ {\ + HYPRE_Int i1;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopInit(ndim, loop_size);\ @@ -637,6 +464,7 @@ #define zypre_BoxLoop1For(i1)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ @@ -660,6 +488,7 @@ dbox1, start1, stride1, i1,\ dbox2, start2, stride2, i2)\ {\ + HYPRE_Int i1,i2;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -670,6 +499,7 @@ #define zypre_BoxLoop2For(i1, i2)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -697,6 +527,7 @@ dbox2, start2, stride2, i2,\ dbox3, start3, stride3, i3)\ {\ + HYPRE_Int i1,i2,i3;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -709,6 +540,7 @@ #define zypre_BoxLoop3For(i1, i2, i3)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2,i3;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -740,6 +572,7 @@ dbox3, start3, stride3, i3,\ dbox4, start4, stride4, i4)\ {\ + HYPRE_Int i1,i2,i3,i4;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -754,6 +587,7 @@ #define zypre_BoxLoop4For(i1, i2, i3, i4)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2,i3,i4;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -782,6 +616,32 @@ /*-----------------------------------*/ +#define zypre_BasicBoxLoopInitK(k, stridek) \ +hypre__sk##k[0] = stridek[0];\ +hypre__ikinc##k[0] = 0;\ +for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\ +{\ + hypre__sk##k[hypre__d] = stridek[hypre__d];\ + hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\ + hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\ +}\ +hypre__i0inc##k = hypre__sk##k[0];\ +hypre__ikinc##k[hypre__ndim] = 0;\ +hypre__ikstart##k = 0 + +#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\ + stride1, i1,\ + stride2, i2)\ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BasicBoxLoopInitK(1, stride1);\ + zypre_BasicBoxLoopInitK(2, stride2); + +/*-----------------------------------*/ + #endif diff -Nru hypre-2.11.2/src/struct_mv/boxloop_cuda.h hypre-2.13.0/src/struct_mv/boxloop_cuda.h --- hypre-2.11.2/src/struct_mv/boxloop_cuda.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/boxloop_cuda.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,717 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +#include +#include + +struct cuda_traversal {HYPRE_Int cuda;}; +struct omp_traversal {HYPRE_Int omp;}; +#define hypre_exec_policy cuda_traversal() +#define HYPER_LAMBDA [=] __device__ + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + +#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) +inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) +{ + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + } +} +#define BLOCKSIZE 128 + +#define hypre_fence() \ + cudaError err = cudaGetLastError(); \ +if ( cudaSuccess != err )\ +{\ + printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);\ +} \ +AxCheckError(cudaDeviceSynchronize()); + +extern "C++" { +template +__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length) +{ + HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) + loop_body(idx); +} + +template +void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body) +{ + size_t const blockSize = 128; + size_t gridSize = (length + blockSize - 1) / blockSize; + if (gridSize == 0) gridSize = 1; + + //hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize); + forall_kernel<<>>(loop_body,length); +} + +template +void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body) +{ + +#pragma omp parallel for schedule(static) + for (HYPRE_Int idx = 0;idx < length;idx++) + loop_body(idx); +} + +#define zypre_BoxLoopIncK(k,box,i) \ +{ \ +HYPRE_Int idx = idx_local; \ +local_idx = idx % box.lsize0; \ +idx = idx / box.lsize0; \ +i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ +local_idx = idx % box.lsize1; \ +idx = idx / box.lsize1; \ +i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ +local_idx = idx % box.lsize2; \ +idx = idx / box.lsize2; \ +i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ +} + + +template +__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot, + hypre_Boxloop box1) +{ + HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x; + HYPRE_Int local_idx; + HYPRE_Int idx_local = id; + HYPRE_Int hypre_boxD1 = 1; + HYPRE_Int i1 = 0; + //// reducted output + __shared__ T shared_cache [BLOCKSIZE]; + T sum = 1; + local_idx = idx_local % box1.lsize0; + idx_local = idx_local / box1.lsize0; + i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1); + local_idx = idx_local % box1.lsize1; + idx_local = idx_local / box1.lsize1; + i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1); + local_idx = idx_local % box1.lsize2; + idx_local = idx_local / box1.lsize2; + i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1); + if (id < hypre__tot) + sum = a[i1]; + *(shared_cache + threadIdx.x) = sum; + + __syncthreads(); + + ///////// sum of internal cache + + HYPRE_Int i; + + for (i=(BLOCKSIZE /2); i>0 ; i= i/2){ + if (threadIdx.x < i){ + *(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i); + } + __syncthreads(); + } + + if ( threadIdx.x == 0){ + *(b+ blockIdx.x) = shared_cache[0]; + } +} +} + +#define hypre_BoxLoopInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + +#define hypre_newBoxLoopDeclare()\ + HYPRE_Int hypre__i,hypre__j,hypre__k;\ + HYPRE_Int idx_local = idx; + +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_newBoxLoop0End() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +#define hypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + +#define hypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (hypre__k*databox3.strides2 +databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + + +#define hypre_newBoxLoop3End(i1, i2,i3) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ +} + +#define MAX_BLOCK 512 + +extern "C++" { +template +__inline__ __device__ +HYPRE_Int fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) { + static __shared__ T shared[MAX_BLOCK]; + HYPRE_Int lane=threadIdx.x%32; + + shared[threadIdx.x]=val; + __syncthreads(); + + val = (lane+offset +__inline__ __device__ +HYPRE_Real warpReduceSum (T val) { + for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2) + val += __shfl_down(val,offset); + return val; +} + + +template +__inline__ __device__ +HYPRE_Real blockReduceSum(T val) { + static __shared__ T shared[32]; + HYPRE_Int lane=threadIdx.x%warpSize; + HYPRE_Int wid=threadIdx.x/warpSize; + val=warpReduceSum(val); + + //write reduced value to shared memory + if(lane==0) shared[wid]=val; + __syncthreads(); + + //ensure we only grab a value from shared memory if that warp existed + val = (threadIdx.x(val); + + return val; +} + +template +__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N, + hypre_Boxloop box1,hypre_Boxloop box2) { + HYPRE_Int local_idx; + HYPRE_Int idx_local; + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; + HYPRE_Int i1 = 0, i2 = 0; + T sum=T(0); + HYPRE_Int i; + + for(i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) { + T sum=T(0); + for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N, + hypre_Boxloop box1,hypre_Boxloop box2) { + HYPRE_Int threads=512; + HYPRE_Int blocks=min((N+threads-1)/threads,1024); + + hypre_device_reduce_stable_kernel<<>>(a,b,out,N,box1,box2); + hypre_device_reduce_stable_kernel2<<<1,1024>>>(out,out,blocks); +} + +} + +extern "C++" { +template +__global__ void hypre_device_reduction_kernel(HYPRE_Real* out, + HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2, + LOOP_BODY loop_body) +{ + HYPRE_Int local_idx; + HYPRE_Int idx_local; + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; + HYPRE_Int i1 = 0, i2 = 0; + HYPRE_Real sum = HYPRE_Real(0); + HYPRE_Int i; + + for(i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +void hypre_device_reduction (HYPRE_Real* out, + HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2, + LOOP_BODY loop_body) +{ + HYPRE_Int threads=512; + HYPRE_Int blocks=min((N+threads-1)/threads,1024); + + hypre_device_reduction_kernel<<>>(out,N,box1,box2,loop_body); + hypre_device_reduce_stable_kernel2<<<1,1024>>>(out,out,blocks); + +} +} + +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, sum) \ +{ \ + HYPRE_Real sum_old = sum; \ + sum = 0.0; \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + HYPRE_Real *d_c; \ + cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real)); \ + hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, sum) \ + return sum; \ + }); \ + cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \ + sum += sum_old; \ + cudaFree(d_c); \ +} + +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2,sum) \ +{ \ + HYPRE_Real sum_old = sum; \ + sum = 0.0; \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + HYPRE_Real *d_c; \ + cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real)); \ + hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \ + { + +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum) \ + return sum; \ + }); \ + cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \ + sum += sum_old; \ + cudaFree(d_c); \ +} + + + +#define hypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1,xp,sum) \ +{ \ + HYPRE_Real sum_old = sum;\ + sum = 1.0;\ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE; \ + HYPRE_Real *d_b; \ + HYPRE_Real * b = new HYPRE_Real[n_blocks]; \ + cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real)); \ + reduction_mult<<< n_blocks ,BLOCKSIZE>>>(xp,d_b,hypre__tot,databox1); \ + hypre_fence(); \ + for (HYPRE_Int j = 0 ; j< n_blocks ; ++j){ \ + sum *= b[j]; \ + } \ + delete [] b; \ + sum *=sum_old;\ +} + +#define hypre_LoopBegin(size,idx) \ +{ \ + BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence();\ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += hypre__i*databox1.strides0; \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += hypre__j*databox1.strides1; \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += hypre__k*databox1.strides2; \ + +#define hypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += hypre__i*databox1.strides0; \ + i2 += hypre__i*databox2.strides0; \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += hypre__j*databox1.strides1; \ + i2 += hypre__j*databox2.strides1; \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += hypre__k*databox1.strides2; \ + i2 += hypre__k*databox2.strides2; + + +#define hypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop0For() + +#define hypre_newBoxLoop1For(i1) + +#define hypre_newBoxLoop2For(i1, i2) + +#define hypre_newBoxLoop3For(i1, i2, i3) + +#define hypre_newBoxLoop4For(i1, i2, i3, i4) + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock() ; +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0For hypre_newBoxLoop0For +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1For hypre_newBoxLoop1For +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2For hypre_newBoxLoop2For +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3For hypre_newBoxLoop3For +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4For hypre_newBoxLoop4For +#define hypre_BoxLoop4End hypre_newBoxLoop4End +#endif diff -Nru hypre-2.11.2/src/struct_mv/boxloop.h hypre-2.13.0/src/struct_mv/boxloop.h --- hypre-2.11.2/src/struct_mv/boxloop.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/boxloop.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,388 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +#ifdef HYPRE_USING_OPENMP +#ifdef WIN32 +#define Pragma(x) __pragma(#x) +#else +#define Pragma(x) _Pragma(#x) +#endif +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE) +#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else +#define OMP1 +#define OMPREDUCTION() ; +#endif + +typedef struct hypre_Boxloop_struct + { + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; + }hypre_Boxloop; + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{\ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ + {\ + zypre_BoxLoopSet();\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop0End()\ + }\ + zypre_BoxLoopInc1();\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ + { \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop2End(i1, i2)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + + +#define zypre_newBoxLoop3Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + zypre_BoxLoopSetK(3, i3);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop3End(i1, i2, i3)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + i3 += hypre__i0inc3;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + i3 += hypre__ikinc3[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1,\ + dbox2, start2, stride2, i2,\ + dbox3, start3, stride3, i3,\ + dbox4, start4, stride4, i4)\ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopDeclareK(3);\ + zypre_BoxLoopDeclareK(4);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + zypre_BoxLoopSetK(3, i3);\ + zypre_BoxLoopSetK(4, i4);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + i3 += hypre__i0inc3;\ + i4 += hypre__i0inc4;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + i3 += hypre__ikinc3[hypre__d];\ + i4 += hypre__ikinc4[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + sum) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMPREDUCTION() \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, sum)\ + i1 += hypre__i0inc1;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + sum) \ +{\ + HYPRE_Int i1,i2; \ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + OMPREDUCTION() \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define hypre_LoopBegin(size,idx) \ +{ \ + HYPRE_Int idx; \ + for (idx = 0;idx < size;idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + HYPRE_Int d,idx; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + for (idx = 0;idx < hypre__tot;idx++) \ + { \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + + +#define hypre_BoxBoundaryCopyEnd() \ + } \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1,idx; \ + hypre_Boxloop databox1,databox2; \ + HYPRE_Int d; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + for (idx = 0;idx < hypre__tot;idx++) \ + { \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + +#define hypre_BoxDataExchangeEnd() \ + } \ +} + +#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_BoxLoop0Begin +#define hypre_BoxLoop0For zypre_BoxLoop0For +#define hypre_BoxLoop0End zypre_BoxLoop0End +#define hypre_BoxLoop1Begin zypre_BoxLoop1Begin +#define hypre_BoxLoop1For zypre_BoxLoop1For +#define hypre_BoxLoop1End zypre_BoxLoop1End +#define hypre_BoxLoop2Begin zypre_BoxLoop2Begin +#define hypre_BoxLoop2For zypre_BoxLoop2For +#define hypre_BoxLoop2End zypre_BoxLoop2End +#define hypre_BoxLoop3Begin zypre_BoxLoop3Begin +#define hypre_BoxLoop3For zypre_BoxLoop3For +#define hypre_BoxLoop3End zypre_BoxLoop3End +#define hypre_BoxLoop4Begin zypre_BoxLoop4Begin +#define hypre_BoxLoop4For zypre_BoxLoop4For +#define hypre_BoxLoop4End zypre_BoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin + +#endif diff -Nru hypre-2.11.2/src/struct_mv/boxloop_kokkos.h hypre-2.13.0/src/struct_mv/boxloop_kokkos.h --- hypre-2.11.2/src/struct_mv/boxloop_kokkos.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/boxloop_kokkos.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,542 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER +extern "C++" { +#include +} +#if defined( KOKKOS_HAVE_MPI ) +#include +#endif + + typedef struct hypre_Boxloop_struct + { + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; + } hypre_Boxloop; + + #if defined(HYPRE_MEMORY_GPU) + #include + #include + #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) + inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) + { + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + } + } + #define BLOCKSIZE 256 + + #define hypre_fence() \ + cudaError err = cudaGetLastError();\ + if ( cudaSuccess != err ) {\ + printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \ + }\ + AxCheckError(cudaDeviceSynchronize()); + #elif defined(HYPRE_USE_OPENMP) + #define hypre_fence() ; + #elif defined(HYPRE_USING_OPENMP_ACC) + #define hypre_fence() + #else + #define hypre_fence(); + #endif + + #define hypre_newBoxLoopInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + + #define hypre_BoxLoopIncK(k,box,i) \ + { \ + HYPRE_Int idx = idx_local; \ + local_idx = idx % box.lsize0; \ + idx = idx / box.lsize0; \ + i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + local_idx = idx % box.lsize1; \ + idx = idx / box.lsize1; \ + i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + local_idx = idx % box.lsize2; \ + idx = idx / box.lsize2; \ + i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ + } + + #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + + #define hypre_newBoxLoopDeclare() \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; + + #define hypre_newBoxLoop0Begin(ndim, loop_size) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { + + + #define hypre_newBoxLoop0End(i1) \ + }); \ + } + + + #define hypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size) \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); + + + #define hypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence(); \ + } + + + #define hypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + #define hypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence(); \ + } + + + #define hypre_newBoxLoop3Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); + + #define hypre_newBoxLoop3End(i1, i2, i3) \ + }); \ + hypre_fence(); \ + } + + #define hypre_newBoxLoop4Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + + + #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ + } + + #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + + + + #define hypre_newBoxLoop1ReductionEnd(i1, sum) \ + },sum); \ + hypre_fence(); \ + sum += sum_tmp; \ + } + + #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 0.0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + + #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum) \ + },sum); \ + hypre_fence(); \ + sum +=sum_tmp; \ + } + + #define hypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1, xp, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 1.0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + sum *= xp[i1]; \ + },sum); \ + hypre_fence(); \ + sum *=sum_tmp; \ +} + + +#define hypre_LoopBegin(size,idx) \ +{ \ + Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + +#define hypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + + + +#define hypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop0For() + +#define zypre_newBoxLoop1For(i1) + +#define zypre_newBoxLoop2For(i1, i2) + +#define zypre_newBoxLoop3For(i1, i2, i3) + +#define zypre_newBoxLoop4For(i1, i2, i3, i4) + +#define hypre_newBoxLoopSetOneBlock() {} + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock +#define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0For hypre_newBoxLoop0For +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1For hypre_newBoxLoop1For +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2For hypre_newBoxLoop2For +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3For hypre_newBoxLoop3For +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4For hypre_newBoxLoop4For +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin +//#define hypre_newBoxLoop1ReductionEnd hypre_newBoxLoop1ReductionEnd +//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin +//#define hypre_newBoxLoop2ReductionEnd hypre_newBoxLoop2ReductionEnd +//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult +//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin +//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd +//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin +//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd + +#endif diff -Nru hypre-2.11.2/src/struct_mv/boxloop_raja.h hypre-2.13.0/src/struct_mv/boxloop_raja.h --- hypre-2.11.2/src/struct_mv/boxloop_raja.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/boxloop_raja.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,845 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +extern "C++" { +#include +} +using namespace RAJA; + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + +#define BLOCKSIZE 256 + +#if defined(HYPRE_MEMORY_GPU) +#include +#include + +#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) +inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) +{ + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + HYPRE_Int *p = NULL; *p = 1; + } +} + +#define hypre_exec_policy cuda_exec +#define hypre_reduce_policy cuda_reduce_atomic +#define hypre_fence() \ +cudaError err = cudaGetLastError();\ +if ( cudaSuccess != err ) {\ +printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \ +}\ +AxCheckError(cudaDeviceSynchronize()); + +#elif defined(HYPRE_USE_OPENMP) + #define hypre_exec_policy omp_for_exec + #define hypre_reduce_policy omp_reduce + #define hypre_fence() +#elif defined(HYPRE_USING_OPENMP_ACC) + #define hypre_exec_policy omp_parallel_for_acc + #define hypre_reduce_policy omp_acc_reduce +#else + #define hypre_exec_policy seq_exec + #define hypre_reduce_policy seq_reduce + #define hypre_fence() +#endif + +#define zypre_BoxLoopIncK(k,box,i) \ +{ \ + HYPRE_Int idx = idx_local; \ + local_idx = idx % box.lsize0; \ + idx = idx / box.lsize0; \ + i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + local_idx = idx % box.lsize1; \ + idx = idx / box.lsize1; \ + i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + local_idx = idx % box.lsize2; \ + idx = idx / box.lsize2; \ + i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ +} + + +#define zypre_BoxLoopCUDAInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + +#define zypre_BoxLoopCUDADeclare() \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { + + +#define zypre_newBoxLoop0End() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + zypre_BoxLoopIncK(1,databox1,i1); + + +#define zypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence();\ +} + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + + +#define zypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence();\ +} + +#define zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ + { \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ +} + +#define MAX_BLOCK BLOCKSIZE + +extern "C++" { +#if defined(HYPRE_MEMORY_GPU) +template +class ReduceMult +{ +public: + /*! + * \brief Constructor takes initial reduction value (default constructor + * is disabled). + * + * Note: Constructor only executes on the host. + */ + explicit ReduceMult(T init_val) + { + m_is_copy_host = false; + m_myID = getCudaReductionId(); + getCudaReductionTallyBlock(m_myID, + (void **)&m_tally_host, + (void **)&m_tally_device); + m_tally_host->tally = init_val; + } + + /*! + * \brief Initialize shared memory on device, request shared memory on host. + * + * Copy constructor executes on both host and device. + * On host requests dynamic shared memory and gets offset into dynamic + * shared memory if in forall. + * On device initializes dynamic shared memory to appropriate value. + */ + RAJA_HOST_DEVICE + ReduceMult(const ReduceMult &other) + { + *this = other; +#if defined(__CUDA_ARCH__) + m_is_copy_device = true; + m_finish_reduction = !other.m_is_copy_device; + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + // initialize shared memory + T val = static_cast(0); + for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) { + // this descends all the way to 1 + if (threadId < i) { + sd[threadId + i] = val; + } + } + if (threadId < 1) { + sd[threadId] = val; + } + + __syncthreads(); +#else + m_is_copy_host = true; + m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T)); +#endif + } + + /*! + * \brief Finish reduction on device and free memory on host. + * + * Destruction on host releases the device memory chunk for + * reduction id and id itself for others to use. + * Destruction on device completes the reduction. + * + * Note: destructor executes on both host and device. + */ + RAJA_HOST_DEVICE ~ReduceMult() + { +#if defined(__CUDA_ARCH__) + if (m_finish_reduction) { + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = 1; + __syncthreads(); + + for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) { + if (threadId < i) { + sd[threadId] *= sd[threadId + i]; + } + __syncthreads(); + } + + if (threadId < WARP_SIZE) { + temp = sd[threadId]; + for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) { + temp *= HIDDEN::shfl_xor(temp, i); + } + } + + // one thread adds to tally + if (threadId == 0) { + _atomicAdd(&(m_tally_device->tally), temp); + } + } +#else + if (!m_is_copy_host) { + releaseCudaReductionTallyBlock(m_myID); + releaseCudaReductionId(m_myID); + } +#endif + + + } + + /*! + * \brief Operator that returns reduced sum value. + * + * Note: accessor only executes on host. + */ + operator T() + { + beforeCudaReadTallyBlock(m_myID); + return m_tally_host->tally; + } + + /*! + * \brief Operator that returns reduced sum value. + * + * Note: accessor only executes on host. + */ + T get() { return operator T(); } + + /*! + * \brief Operator that adds value to sum. + * + * Note: only operates on device. + */ + RAJA_DEVICE ReduceMult const & + operator*=(T val) const + { + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + sd[threadId] *= val; + + return *this; + } + +private: + /*! + * \brief Default constructor is declared private and not implemented. + */ + ReduceMult(); + + /*! + * \brief Pointer to host tally block cache slot for this reduction variable. + */ + CudaReductionTallyTypeAtomic *m_tally_host = nullptr; + + /*! + * \brief Pointer to device tally block slot for this reduction variable. + */ + CudaReductionTallyTypeAtomic *m_tally_device = nullptr; + + /*! + * \brief My cuda reduction variable ID. + */ + HYPRE_Int m_myID = -1; + + /*! + * \brief Byte offset into dynamic shared memory. + */ + HYPRE_Int m_smem_offset = -1; + + /*! + * \brief If this variable is a copy or not; only original may release memory + * or perform finalization. + */ + bool m_is_copy_host = false; + bool m_is_copy_device = false; + bool m_finish_reduction = false; + + // Sanity checks for block size and template type size + static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1))); + static constexpr bool reasonableRangeCheck = + ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024)); + static constexpr bool sizeofcheck = + ((sizeof(T) <= sizeof(CudaReductionDummyDataType)) + && (sizeof(CudaReductionTallyType) + <= sizeof(CudaReductionDummyTallyType)) + && (sizeof(CudaReductionBlockType) + <= sizeof(CudaReductionDummyBlockType))); + static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2"); + static_assert(reasonableRangeCheck, + "Error: block sizes must be between 32 and 1024"); + static_assert(sizeofcheck, + "Error: type must be of size <= " + RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE)); +}; +#elif defined(HYPRE_USING_OPENMP) + template + class ReduceMult + { + using my_type = ReduceMult; + + public: + // + // Constructor takes default value (default ctor is disabled). + // + explicit ReduceMult(T init_val, T initializer = 1) + : m_parent(NULL), m_val(init_val), m_custom_init(initializer) + { + } + + // + // Copy ctor. + // + ReduceMult(const ReduceMult& other) : + m_parent(other.m_parent ? other.m_parent : &other), + m_val(other.m_custom_init), + m_custom_init(other.m_custom_init) + { + } + + // + // Destruction releases the shared memory block chunk for reduction id + // and id itself for others to use. + // + ~ReduceMult() + { + if (m_parent) { +#pragma omp critical + { + *m_parent *= m_val; + } + } + } + + // + // Operator that returns reduced sum value. + // + operator T() + { + return m_val; + } + + // + // Method that returns sum value. + // + T get() { return operator T(); } + + // + // += operator that adds value to sum for current thread. + // + const ReduceMult& operator*=(T rhs) const + { + this->m_val *= rhs; + return *this; + } + + ReduceMult& operator*=(T rhs) + { + this->m_val *= rhs; + return *this; + } + + private: + // + // Default ctor is declared private and not implemented. + // + ReduceMult(); + + const my_type * m_parent; + + mutable T m_val; + T m_custom_init; + + }; +#else + template + class ReduceMult + { + using my_type = ReduceMult; + + public: + // + // Constructor takes default value (default ctor is disabled). + // + explicit ReduceMult(T init_m_val, T initializer = 1) : + m_parent(NULL), + m_val(init_m_val), + m_custom_init(initializer) + { + } + + // + // Copy ctor. + // + ReduceMult(const ReduceMult& other) : + m_parent(other.m_parent ? other.m_parent : &other), + m_val(other.m_custom_init), + m_custom_init(other.m_custom_init) + { + } + + // + // Destruction releases the shared memory block chunk for reduction id + // and id itself for others to use. + // + ~ReduceMult() + { + if (m_parent) { + *m_parent *= m_val; + } + } + + // + // Operator that returns reduced sum value. + // + operator T() + { + return m_val; + } + + // + // Method that returns reduced sum value. + // + T get() { return operator T(); } + + // + // += operator that adds value to sum. + // + ReduceMult& operator*=(T rhs) + { + this->m_val *= rhs; + return *this; + } + + const ReduceMult& operator*=(T rhs) const + { + this->m_val *= rhs; + return *this; + } + + private: + // + // Default ctor is declared private and not implemented. + // + ReduceMult(); + + const my_type * m_parent; + + mutable T m_val; + T m_custom_init; + }; +#endif +} + + +#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1,sum) \ +{ \ + HYPRE_Real sum_tmp; \ + { \ + ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0); \ + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1) \ + { + +#define zypre_newBoxLoop1ReductionEnd(i1,sum) \ + } \ + zypre_newBoxLoop1End(i1); \ + hypre_fence(); \ + sum_tmp = (HYPRE_Real)(sum); \ + } \ + sum += sum_tmp; \ +} + +#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2,sum) \ +{ \ + HYPRE_Real sum_tmp; \ + { \ + ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0); \ + zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1,i1,\ + dbox2, start2, stride2,i2) \ + { + +#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum) \ + } \ + zypre_newBoxLoop2End(i1,i2); \ + hypre_fence(); \ + sum_tmp = (HYPRE_Real)(sum); \ + } \ + sum += sum_tmp; \ +} + +#define zypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1,xp,sum) \ +{ \ + ReduceMult local_result_raja(1.0); \ + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ + { \ + local_result_raja *= xp[i1]; \ + } \ + zypre_newBoxLoop1End(i1) \ + hypre_fence(); \ + sum *= (HYPRE_Real)(local_result_raja); \ +} + + +#define hypre_LoopBegin(size,idx) \ +{ \ + forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + +#define zypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + + + +#define zypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop0For() + +#define zypre_newBoxLoop1For(i1) + +#define zypre_newBoxLoop2For(i1, i2) + +#define zypre_newBoxLoop3For(i1, i2, i3) + +#define zypre_newBoxLoop4For(i1, i2, i3, i4) + +#define zypre_newBoxLoopSetOneBlock() + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock +#define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0For zypre_newBoxLoop0For +#define hypre_BoxLoop0End zypre_newBoxLoop0End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1For zypre_newBoxLoop1For +#define hypre_BoxLoop1End zypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2For zypre_newBoxLoop2For +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3For zypre_newBoxLoop3For +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define hypre_BoxLoop4For zypre_newBoxLoop4For +#define hypre_BoxLoop4End zypre_newBoxLoop4End + +#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin +#define hypre_newBoxLoop1ReductionEnd zypre_newBoxLoop1ReductionEnd +#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin +#define hypre_newBoxLoop2ReductionEnd zypre_newBoxLoop2ReductionEnd +#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult +#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin +#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd +#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin +#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd +#endif diff -Nru hypre-2.11.2/src/struct_mv/box_manager.c hypre-2.13.0/src/struct_mv/box_manager.c --- hypre-2.11.2/src/struct_mv/box_manager.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/box_manager.c 2017-10-20 17:42:22.000000000 +0000 @@ -2226,7 +2226,7 @@ HYPRE_Int size[HYPRE_MAXDIM]; HYPRE_Int iminmax[2]; HYPRE_Int index_not_there; - HYPRE_Int d, e, ii, itsize; + HYPRE_Int d, e, itsize; HYPRE_Int mystart, myfinish; HYPRE_Int imin[HYPRE_MAXDIM]; HYPRE_Int imax[HYPRE_MAXDIM]; @@ -2388,9 +2388,7 @@ /* set up index table */ hypre_BoxSetExtents(index_box, imin, imax); hypre_BoxGetSize(index_box, loop_size); - hypre_BoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop1For(ii) + hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii); { if (!index_table[ii]) /* no entry- add one */ { @@ -2403,7 +2401,7 @@ index_table[ii] = entry; } } - hypre_BoxLoop1End(ii); + hypre_SerialBoxLoop1End(ii); } /* end of subset of entries */ }/* end of three loops over subsets */ @@ -2464,7 +2462,7 @@ HYPRE_Int *nentries_ptr ) { HYPRE_Int ndim = hypre_BoxManNDim(manager); - HYPRE_Int d, ii; + HYPRE_Int d; HYPRE_Int find_index_d, current_index_d; HYPRE_Int *man_indexes_d; HYPRE_Int man_index_size_d; @@ -2581,9 +2579,7 @@ hypre_BoxShiftNeg(table_box, stride); /* Want box to start at 0*/ hypre_BoxSetExtents(index_box, man_ilower, man_iupper); hypre_BoxGetSize(index_box, loop_size); - hypre_BoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop1For(ii) + hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii); { entry = index_table[ii]; @@ -2601,7 +2597,7 @@ entry = hypre_BoxManEntryNext(entry); } } - hypre_BoxLoop1End(ii); + hypre_SerialBoxLoop1End(ii); entries = hypre_TReAlloc(entries, hypre_BoxManEntry *, nentries); diff -Nru hypre-2.11.2/src/struct_mv/communication_info.c hypre-2.13.0/src/struct_mv/communication_info.c --- hypre-2.11.2/src/struct_mv/communication_info.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/communication_info.c 2017-10-20 17:42:22.000000000 +0000 @@ -431,14 +431,13 @@ hypre_BoxSetExtents(sbox, istart, istop); start = hypre_BoxIMin(sbox); hypre_BoxGetSize(sbox, loop_size); - hypre_BoxLoop1Begin(ndim, loop_size, + + hypre_SerialBoxLoop1Begin(ndim, loop_size, stencil_box, start, stride, si); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop1For(si) { stencil_grid[si] = 1; } - hypre_BoxLoop1End(si); + hypre_SerialBoxLoop1End(si); } /*------------------------------------------------------ @@ -816,9 +815,7 @@ size = 0; start = hypre_BoxIMin(box); hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop0Begin(ndim, loop_size); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { hypre_BoxLoopGetIndex(ii); for (d = 0; d < ndim; d++) @@ -835,7 +832,7 @@ } size++; } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); hypre_BoxDestroy(box); diff -Nru hypre-2.11.2/src/struct_mv/headers hypre-2.13.0/src/struct_mv/headers --- hypre-2.11.2/src/struct_mv/headers 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/headers 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,9 @@ cat > $INTERNAL_HEADER <<@ +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + + #ifndef hypre_STRUCT_MV_HEADER #define hypre_STRUCT_MV_HEADER @@ -29,25 +32,56 @@ #include "HYPRE_struct_mv.h" #include "_hypre_utilities.h" +@ + + +cat >> $INTERNAL_HEADER <<@ +#if defined(HYPRE_USE_RAJA) +@ + +cat boxloop_raja.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ +#elif defined(HYPRE_USE_KOKKOS) +@ + +cat boxloop_kokkos.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ +#elif defined(HYPRE_USE_CUDA) +@ + +cat boxloop_cuda.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ +#else +@ + +cat boxloop.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ +#endif +@ + +cat >> $INTERNAL_HEADER <<@ #ifdef __cplusplus extern "C" { #endif - @ #=========================================================================== # Structures and prototypes #=========================================================================== -cat box.h >> $INTERNAL_HEADER +cat box.h >> $INTERNAL_HEADER cat assumed_part.h >> $INTERNAL_HEADER cat box_manager.h >> $INTERNAL_HEADER -cat struct_grid.h >> $INTERNAL_HEADER -cat struct_stencil.h >> $INTERNAL_HEADER -cat struct_communication.h >> $INTERNAL_HEADER -cat computation.h >> $INTERNAL_HEADER -cat struct_matrix.h >> $INTERNAL_HEADER -cat struct_vector.h >> $INTERNAL_HEADER +cat struct_grid.h >> $INTERNAL_HEADER +cat struct_stencil.h >> $INTERNAL_HEADER +cat struct_communication.h >> $INTERNAL_HEADER +cat computation.h >> $INTERNAL_HEADER +cat struct_matrix.h >> $INTERNAL_HEADER +cat struct_vector.h >> $INTERNAL_HEADER cat protos.h >> $INTERNAL_HEADER diff -Nru hypre-2.11.2/src/struct_mv/_hypre_struct_mv.h hypre-2.13.0/src/struct_mv/_hypre_struct_mv.h --- hypre-2.11.2/src/struct_mv/_hypre_struct_mv.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/_hypre_struct_mv.h 2017-10-20 17:42:22.000000000 +0000 @@ -1,3 +1,2125 @@ + +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + + +#ifndef hypre_STRUCT_MV_HEADER +#define hypre_STRUCT_MV_HEADER + +#include +#include +#include + +#include "HYPRE_struct_mv.h" +#include "_hypre_utilities.h" + +#if defined(HYPRE_USE_RAJA) +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +extern "C++" { +#include +} +using namespace RAJA; + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + +#define BLOCKSIZE 256 + +#if defined(HYPRE_MEMORY_GPU) +#include +#include + +#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) +inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) +{ + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + HYPRE_Int *p = NULL; *p = 1; + } +} + +#define hypre_exec_policy cuda_exec +#define hypre_reduce_policy cuda_reduce_atomic +#define hypre_fence() \ +cudaError err = cudaGetLastError();\ +if ( cudaSuccess != err ) {\ +printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \ +}\ +AxCheckError(cudaDeviceSynchronize()); + +#elif defined(HYPRE_USE_OPENMP) + #define hypre_exec_policy omp_for_exec + #define hypre_reduce_policy omp_reduce + #define hypre_fence() +#elif defined(HYPRE_USING_OPENMP_ACC) + #define hypre_exec_policy omp_parallel_for_acc + #define hypre_reduce_policy omp_acc_reduce +#else + #define hypre_exec_policy seq_exec + #define hypre_reduce_policy seq_reduce + #define hypre_fence() +#endif + +#define zypre_BoxLoopIncK(k,box,i) \ +{ \ + HYPRE_Int idx = idx_local; \ + local_idx = idx % box.lsize0; \ + idx = idx / box.lsize0; \ + i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + local_idx = idx % box.lsize1; \ + idx = idx / box.lsize1; \ + i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + local_idx = idx % box.lsize2; \ + idx = idx / box.lsize2; \ + i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ +} + + +#define zypre_BoxLoopCUDAInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + +#define zypre_BoxLoopCUDADeclare() \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { + + +#define zypre_newBoxLoop0End() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + zypre_BoxLoopIncK(1,databox1,i1); + + +#define zypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence();\ +} + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + + +#define zypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence();\ +} + +#define zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ + { \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ +} + +#define MAX_BLOCK BLOCKSIZE + +extern "C++" { +#if defined(HYPRE_MEMORY_GPU) +template +class ReduceMult +{ +public: + /*! + * \brief Constructor takes initial reduction value (default constructor + * is disabled). + * + * Note: Constructor only executes on the host. + */ + explicit ReduceMult(T init_val) + { + m_is_copy_host = false; + m_myID = getCudaReductionId(); + getCudaReductionTallyBlock(m_myID, + (void **)&m_tally_host, + (void **)&m_tally_device); + m_tally_host->tally = init_val; + } + + /*! + * \brief Initialize shared memory on device, request shared memory on host. + * + * Copy constructor executes on both host and device. + * On host requests dynamic shared memory and gets offset into dynamic + * shared memory if in forall. + * On device initializes dynamic shared memory to appropriate value. + */ + RAJA_HOST_DEVICE + ReduceMult(const ReduceMult &other) + { + *this = other; +#if defined(__CUDA_ARCH__) + m_is_copy_device = true; + m_finish_reduction = !other.m_is_copy_device; + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + // initialize shared memory + T val = static_cast(0); + for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) { + // this descends all the way to 1 + if (threadId < i) { + sd[threadId + i] = val; + } + } + if (threadId < 1) { + sd[threadId] = val; + } + + __syncthreads(); +#else + m_is_copy_host = true; + m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T)); +#endif + } + + /*! + * \brief Finish reduction on device and free memory on host. + * + * Destruction on host releases the device memory chunk for + * reduction id and id itself for others to use. + * Destruction on device completes the reduction. + * + * Note: destructor executes on both host and device. + */ + RAJA_HOST_DEVICE ~ReduceMult() + { +#if defined(__CUDA_ARCH__) + if (m_finish_reduction) { + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = 1; + __syncthreads(); + + for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) { + if (threadId < i) { + sd[threadId] *= sd[threadId + i]; + } + __syncthreads(); + } + + if (threadId < WARP_SIZE) { + temp = sd[threadId]; + for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) { + temp *= HIDDEN::shfl_xor(temp, i); + } + } + + // one thread adds to tally + if (threadId == 0) { + _atomicAdd(&(m_tally_device->tally), temp); + } + } +#else + if (!m_is_copy_host) { + releaseCudaReductionTallyBlock(m_myID); + releaseCudaReductionId(m_myID); + } +#endif + + + } + + /*! + * \brief Operator that returns reduced sum value. + * + * Note: accessor only executes on host. + */ + operator T() + { + beforeCudaReadTallyBlock(m_myID); + return m_tally_host->tally; + } + + /*! + * \brief Operator that returns reduced sum value. + * + * Note: accessor only executes on host. + */ + T get() { return operator T(); } + + /*! + * \brief Operator that adds value to sum. + * + * Note: only operates on device. + */ + RAJA_DEVICE ReduceMult const & + operator*=(T val) const + { + extern __shared__ unsigned char sd_block[]; + T *sd = reinterpret_cast(&sd_block[m_smem_offset]); + + HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + sd[threadId] *= val; + + return *this; + } + +private: + /*! + * \brief Default constructor is declared private and not implemented. + */ + ReduceMult(); + + /*! + * \brief Pointer to host tally block cache slot for this reduction variable. + */ + CudaReductionTallyTypeAtomic *m_tally_host = nullptr; + + /*! + * \brief Pointer to device tally block slot for this reduction variable. + */ + CudaReductionTallyTypeAtomic *m_tally_device = nullptr; + + /*! + * \brief My cuda reduction variable ID. + */ + HYPRE_Int m_myID = -1; + + /*! + * \brief Byte offset into dynamic shared memory. + */ + HYPRE_Int m_smem_offset = -1; + + /*! + * \brief If this variable is a copy or not; only original may release memory + * or perform finalization. + */ + bool m_is_copy_host = false; + bool m_is_copy_device = false; + bool m_finish_reduction = false; + + // Sanity checks for block size and template type size + static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1))); + static constexpr bool reasonableRangeCheck = + ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024)); + static constexpr bool sizeofcheck = + ((sizeof(T) <= sizeof(CudaReductionDummyDataType)) + && (sizeof(CudaReductionTallyType) + <= sizeof(CudaReductionDummyTallyType)) + && (sizeof(CudaReductionBlockType) + <= sizeof(CudaReductionDummyBlockType))); + static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2"); + static_assert(reasonableRangeCheck, + "Error: block sizes must be between 32 and 1024"); + static_assert(sizeofcheck, + "Error: type must be of size <= " + RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE)); +}; +#elif defined(HYPRE_USING_OPENMP) + template + class ReduceMult + { + using my_type = ReduceMult; + + public: + // + // Constructor takes default value (default ctor is disabled). + // + explicit ReduceMult(T init_val, T initializer = 1) + : m_parent(NULL), m_val(init_val), m_custom_init(initializer) + { + } + + // + // Copy ctor. + // + ReduceMult(const ReduceMult& other) : + m_parent(other.m_parent ? other.m_parent : &other), + m_val(other.m_custom_init), + m_custom_init(other.m_custom_init) + { + } + + // + // Destruction releases the shared memory block chunk for reduction id + // and id itself for others to use. + // + ~ReduceMult() + { + if (m_parent) { +#pragma omp critical + { + *m_parent *= m_val; + } + } + } + + // + // Operator that returns reduced sum value. + // + operator T() + { + return m_val; + } + + // + // Method that returns sum value. + // + T get() { return operator T(); } + + // + // += operator that adds value to sum for current thread. + // + const ReduceMult& operator*=(T rhs) const + { + this->m_val *= rhs; + return *this; + } + + ReduceMult& operator*=(T rhs) + { + this->m_val *= rhs; + return *this; + } + + private: + // + // Default ctor is declared private and not implemented. + // + ReduceMult(); + + const my_type * m_parent; + + mutable T m_val; + T m_custom_init; + + }; +#else + template + class ReduceMult + { + using my_type = ReduceMult; + + public: + // + // Constructor takes default value (default ctor is disabled). + // + explicit ReduceMult(T init_m_val, T initializer = 1) : + m_parent(NULL), + m_val(init_m_val), + m_custom_init(initializer) + { + } + + // + // Copy ctor. + // + ReduceMult(const ReduceMult& other) : + m_parent(other.m_parent ? other.m_parent : &other), + m_val(other.m_custom_init), + m_custom_init(other.m_custom_init) + { + } + + // + // Destruction releases the shared memory block chunk for reduction id + // and id itself for others to use. + // + ~ReduceMult() + { + if (m_parent) { + *m_parent *= m_val; + } + } + + // + // Operator that returns reduced sum value. + // + operator T() + { + return m_val; + } + + // + // Method that returns reduced sum value. + // + T get() { return operator T(); } + + // + // += operator that adds value to sum. + // + ReduceMult& operator*=(T rhs) + { + this->m_val *= rhs; + return *this; + } + + const ReduceMult& operator*=(T rhs) const + { + this->m_val *= rhs; + return *this; + } + + private: + // + // Default ctor is declared private and not implemented. + // + ReduceMult(); + + const my_type * m_parent; + + mutable T m_val; + T m_custom_init; + }; +#endif +} + + +#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1,sum) \ +{ \ + HYPRE_Real sum_tmp; \ + { \ + ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0); \ + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1) \ + { + +#define zypre_newBoxLoop1ReductionEnd(i1,sum) \ + } \ + zypre_newBoxLoop1End(i1); \ + hypre_fence(); \ + sum_tmp = (HYPRE_Real)(sum); \ + } \ + sum += sum_tmp; \ +} + +#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2,sum) \ +{ \ + HYPRE_Real sum_tmp; \ + { \ + ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0); \ + zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1,i1,\ + dbox2, start2, stride2,i2) \ + { + +#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum) \ + } \ + zypre_newBoxLoop2End(i1,i2); \ + hypre_fence(); \ + sum_tmp = (HYPRE_Real)(sum); \ + } \ + sum += sum_tmp; \ +} + +#define zypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1,xp,sum) \ +{ \ + ReduceMult local_result_raja(1.0); \ + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ + { \ + local_result_raja *= xp[i1]; \ + } \ + zypre_newBoxLoop1End(i1) \ + hypre_fence(); \ + sum *= (HYPRE_Real)(local_result_raja); \ +} + + +#define hypre_LoopBegin(size,idx) \ +{ \ + forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + +#define zypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopCUDAInit(ndim,loop_size); \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \ + { \ + zypre_BoxLoopCUDADeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + + + +#define zypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop0For() + +#define zypre_newBoxLoop1For(i1) + +#define zypre_newBoxLoop2For(i1, i2) + +#define zypre_newBoxLoop3For(i1, i2, i3) + +#define zypre_newBoxLoop4For(i1, i2, i3, i4) + +#define zypre_newBoxLoopSetOneBlock() + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock +#define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0For zypre_newBoxLoop0For +#define hypre_BoxLoop0End zypre_newBoxLoop0End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1For zypre_newBoxLoop1For +#define hypre_BoxLoop1End zypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2For zypre_newBoxLoop2For +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3For zypre_newBoxLoop3For +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define hypre_BoxLoop4For zypre_newBoxLoop4For +#define hypre_BoxLoop4End zypre_newBoxLoop4End + +#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin +#define hypre_newBoxLoop1ReductionEnd zypre_newBoxLoop1ReductionEnd +#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin +#define hypre_newBoxLoop2ReductionEnd zypre_newBoxLoop2ReductionEnd +#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult +#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin +#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd +#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin +#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd +#endif +#elif defined(HYPRE_USE_KOKKOS) +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER +extern "C++" { +#include +} +#if defined( KOKKOS_HAVE_MPI ) +#include +#endif + + typedef struct hypre_Boxloop_struct + { + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; + } hypre_Boxloop; + + #if defined(HYPRE_MEMORY_GPU) + #include + #include + #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) + inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) + { + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + } + } + #define BLOCKSIZE 256 + + #define hypre_fence() \ + cudaError err = cudaGetLastError();\ + if ( cudaSuccess != err ) {\ + printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \ + }\ + AxCheckError(cudaDeviceSynchronize()); + #elif defined(HYPRE_USE_OPENMP) + #define hypre_fence() ; + #elif defined(HYPRE_USING_OPENMP_ACC) + #define hypre_fence() + #else + #define hypre_fence(); + #endif + + #define hypre_newBoxLoopInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + + #define hypre_BoxLoopIncK(k,box,i) \ + { \ + HYPRE_Int idx = idx_local; \ + local_idx = idx % box.lsize0; \ + idx = idx / box.lsize0; \ + i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + local_idx = idx % box.lsize1; \ + idx = idx / box.lsize1; \ + i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + local_idx = idx % box.lsize2; \ + idx = idx / box.lsize2; \ + i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ + } + + #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + + #define hypre_newBoxLoopDeclare() \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; + + #define hypre_newBoxLoop0Begin(ndim, loop_size) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { + + + #define hypre_newBoxLoop0End(i1) \ + }); \ + } + + + #define hypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size) \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); + + + #define hypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence(); \ + } + + + #define hypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + #define hypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence(); \ + } + + + #define hypre_newBoxLoop3Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); + + #define hypre_newBoxLoop3End(i1, i2, i3) \ + }); \ + hypre_fence(); \ + } + + #define hypre_newBoxLoop4Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ + { \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + + + #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ + } + + #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + + + + #define hypre_newBoxLoop1ReductionEnd(i1, sum) \ + },sum); \ + hypre_fence(); \ + sum += sum_tmp; \ + } + + #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 0.0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + + + #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum) \ + },sum); \ + hypre_fence(); \ + sum +=sum_tmp; \ + } + + #define hypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1, xp, sum) \ + { \ + HYPRE_Real sum_tmp = sum; \ + sum = 1.0; \ + hypre_newBoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + sum *= xp[i1]; \ + },sum); \ + hypre_fence(); \ + sum *=sum_tmp; \ +} + + +#define hypre_LoopBegin(size,idx) \ +{ \ + Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + +#define hypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + + + +#define hypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define zypre_newBoxLoop0For() + +#define zypre_newBoxLoop1For(i1) + +#define zypre_newBoxLoop2For(i1, i2) + +#define zypre_newBoxLoop3For(i1, i2, i3) + +#define zypre_newBoxLoop4For(i1, i2, i3, i4) + +#define hypre_newBoxLoopSetOneBlock() {} + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock +#define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0For hypre_newBoxLoop0For +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1For hypre_newBoxLoop1For +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2For hypre_newBoxLoop2For +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3For hypre_newBoxLoop3For +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4For hypre_newBoxLoop4For +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin +//#define hypre_newBoxLoop1ReductionEnd hypre_newBoxLoop1ReductionEnd +//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin +//#define hypre_newBoxLoop2ReductionEnd hypre_newBoxLoop2ReductionEnd +//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult +//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin +//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd +//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin +//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd + +#endif +#elif defined(HYPRE_USE_CUDA) +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +#include +#include + +struct cuda_traversal {HYPRE_Int cuda;}; +struct omp_traversal {HYPRE_Int omp;}; +#define hypre_exec_policy cuda_traversal() +#define HYPER_LAMBDA [=] __device__ + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + +#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__) +inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line) +{ + if (err) + { + printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line); + } +} +#define BLOCKSIZE 128 + +#define hypre_fence() \ + cudaError err = cudaGetLastError(); \ +if ( cudaSuccess != err )\ +{\ + printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);\ +} \ +AxCheckError(cudaDeviceSynchronize()); + +extern "C++" { +template +__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length) +{ + HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) + loop_body(idx); +} + +template +void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body) +{ + size_t const blockSize = 128; + size_t gridSize = (length + blockSize - 1) / blockSize; + if (gridSize == 0) gridSize = 1; + + //hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize); + forall_kernel<<>>(loop_body,length); +} + +template +void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body) +{ + +#pragma omp parallel for schedule(static) + for (HYPRE_Int idx = 0;idx < length;idx++) + loop_body(idx); +} + +#define zypre_BoxLoopIncK(k,box,i) \ +{ \ +HYPRE_Int idx = idx_local; \ +local_idx = idx % box.lsize0; \ +idx = idx / box.lsize0; \ +i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ +local_idx = idx % box.lsize1; \ +idx = idx / box.lsize1; \ +i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ +local_idx = idx % box.lsize2; \ +idx = idx / box.lsize2; \ +i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k; \ +hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); \ +} + + +template +__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot, + hypre_Boxloop box1) +{ + HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x; + HYPRE_Int local_idx; + HYPRE_Int idx_local = id; + HYPRE_Int hypre_boxD1 = 1; + HYPRE_Int i1 = 0; + //// reducted output + __shared__ T shared_cache [BLOCKSIZE]; + T sum = 1; + local_idx = idx_local % box1.lsize0; + idx_local = idx_local / box1.lsize0; + i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1); + local_idx = idx_local % box1.lsize1; + idx_local = idx_local / box1.lsize1; + i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1); + local_idx = idx_local % box1.lsize2; + idx_local = idx_local / box1.lsize2; + i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1; + hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1); + if (id < hypre__tot) + sum = a[i1]; + *(shared_cache + threadIdx.x) = sum; + + __syncthreads(); + + ///////// sum of internal cache + + HYPRE_Int i; + + for (i=(BLOCKSIZE /2); i>0 ; i= i/2){ + if (threadIdx.x < i){ + *(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i); + } + __syncthreads(); + } + + if ( threadIdx.x == 0){ + *(b+ blockIdx.x) = shared_cache[0]; + } +} +} + +#define hypre_BoxLoopInit(ndim,loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int i = 0;i < ndim;i ++) \ + hypre__tot *= loop_size[i]; + + +#define hypre_newBoxLoopDeclare()\ + HYPRE_Int hypre__i,hypre__j,hypre__k;\ + HYPRE_Int idx_local = idx; + +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_newBoxLoop0End() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0]-dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1]-dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2]-dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +#define hypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1; \ + HYPRE_Int i1 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; \ + HYPRE_Int i1 = 0, i2 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + +#define hypre_newBoxLoop2End(i1, i2) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (hypre__k*databox3.strides2 +databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + + +#define hypre_newBoxLoop3End(i1, i2,i3) \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \ + hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(); \ + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \ + HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1); \ + i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1); \ + i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1); \ + i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1); \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1); \ + i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1); \ + i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1); \ + i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1); \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \ + hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1); \ + i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \ + hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1); \ + i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \ + hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1); \ + i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \ + hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1); \ + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + }); \ + hypre_fence(); \ +} + +#define MAX_BLOCK 512 + +extern "C++" { +template +__inline__ __device__ +HYPRE_Int fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) { + static __shared__ T shared[MAX_BLOCK]; + HYPRE_Int lane=threadIdx.x%32; + + shared[threadIdx.x]=val; + __syncthreads(); + + val = (lane+offset +__inline__ __device__ +HYPRE_Real warpReduceSum (T val) { + for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2) + val += __shfl_down(val,offset); + return val; +} + + +template +__inline__ __device__ +HYPRE_Real blockReduceSum(T val) { + static __shared__ T shared[32]; + HYPRE_Int lane=threadIdx.x%warpSize; + HYPRE_Int wid=threadIdx.x/warpSize; + val=warpReduceSum(val); + + //write reduced value to shared memory + if(lane==0) shared[wid]=val; + __syncthreads(); + + //ensure we only grab a value from shared memory if that warp existed + val = (threadIdx.x(val); + + return val; +} + +template +__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N, + hypre_Boxloop box1,hypre_Boxloop box2) { + HYPRE_Int local_idx; + HYPRE_Int idx_local; + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; + HYPRE_Int i1 = 0, i2 = 0; + T sum=T(0); + HYPRE_Int i; + + for(i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) { + T sum=T(0); + for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N, + hypre_Boxloop box1,hypre_Boxloop box2) { + HYPRE_Int threads=512; + HYPRE_Int blocks=min((N+threads-1)/threads,1024); + + hypre_device_reduce_stable_kernel<<>>(a,b,out,N,box1,box2); + hypre_device_reduce_stable_kernel2<<<1,1024>>>(out,out,blocks); +} + +} + +extern "C++" { +template +__global__ void hypre_device_reduction_kernel(HYPRE_Real* out, + HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2, + LOOP_BODY loop_body) +{ + HYPRE_Int local_idx; + HYPRE_Int idx_local; + HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1; + HYPRE_Int i1 = 0, i2 = 0; + HYPRE_Real sum = HYPRE_Real(0); + HYPRE_Int i; + + for(i=blockIdx.x*blockDim.x+threadIdx.x;i(sum); + if(threadIdx.x==0) + out[blockIdx.x]=sum; +} + +template +void hypre_device_reduction (HYPRE_Real* out, + HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2, + LOOP_BODY loop_body) +{ + HYPRE_Int threads=512; + HYPRE_Int blocks=min((N+threads-1)/threads,1024); + + hypre_device_reduction_kernel<<>>(out,N,box1,box2,loop_body); + hypre_device_reduce_stable_kernel2<<<1,1024>>>(out,out,blocks); + +} +} + +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, sum) \ +{ \ + HYPRE_Real sum_old = sum; \ + sum = 0.0; \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + HYPRE_Real *d_c; \ + cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real)); \ + hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, sum) \ + return sum; \ + }); \ + cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \ + sum += sum_old; \ + cudaFree(d_c); \ +} + +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2,sum) \ +{ \ + HYPRE_Real sum_old = sum; \ + sum = 0.0; \ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \ + HYPRE_Real *d_c; \ + cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real)); \ + hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \ + { + +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum) \ + return sum; \ + }); \ + cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \ + sum += sum_old; \ + cudaFree(d_c); \ +} + + + +#define hypre_newBoxLoop1ReductionMult(ndim, loop_size, \ + dbox1, start1, stride1, i1,xp,sum) \ +{ \ + HYPRE_Real sum_old = sum;\ + sum = 1.0;\ + hypre_BoxLoopInit(ndim,loop_size); \ + hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \ + HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE; \ + HYPRE_Real *d_b; \ + HYPRE_Real * b = new HYPRE_Real[n_blocks]; \ + cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real)); \ + reduction_mult<<< n_blocks ,BLOCKSIZE>>>(xp,d_b,hypre__tot,databox1); \ + hypre_fence(); \ + for (HYPRE_Int j = 0 ; j< n_blocks ; ++j){ \ + sum *= b[j]; \ + } \ + delete [] b; \ + sum *=sum_old;\ +} + +#define hypre_LoopBegin(size,idx) \ +{ \ + BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_LoopEnd() \ + }); \ + hypre_fence();\ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += hypre__i*databox1.strides0; \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += hypre__j*databox1.strides1; \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += hypre__k*databox1.strides2; \ + +#define hypre_BoxBoundaryCopyEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1,databox2; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (HYPRE_Int d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare() \ + HYPRE_Int i1 = 0, i2 = 0; \ + hypre__i = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += hypre__i*databox1.strides0; \ + i2 += hypre__i*databox2.strides0; \ + hypre__j = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += hypre__j*databox1.strides1; \ + i2 += hypre__j*databox2.strides1; \ + hypre__k = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += hypre__k*databox1.strides2; \ + i2 += hypre__k*databox2.strides2; + + +#define hypre_BoxDataExchangeEnd() \ + }); \ + hypre_fence(); \ +} + +#define hypre_newBoxLoop0For() + +#define hypre_newBoxLoop1For(i1) + +#define hypre_newBoxLoop2For(i1, i2) + +#define hypre_newBoxLoop3For(i1, i2, i3) + +#define hypre_newBoxLoop4For(i1, i2, i3, i4) + +#define hypre_newBoxLoopGetIndex(index) \ + index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock() ; +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0For hypre_newBoxLoop0For +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1For hypre_newBoxLoop1For +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2For hypre_newBoxLoop2For +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3For hypre_newBoxLoop3For +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4For hypre_newBoxLoop4For +#define hypre_BoxLoop4End hypre_newBoxLoop4End +#endif +#else /*BHEADER********************************************************************** * Copyright (c) 2008, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. @@ -10,20 +2132,386 @@ * $Revision$ ***********************************************************************EHEADER*/ -#ifndef hypre_STRUCT_MV_HEADER -#define hypre_STRUCT_MV_HEADER +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_NEWBOXLOOP_HEADER +#define HYPRE_NEWBOXLOOP_HEADER + +#ifdef HYPRE_USING_OPENMP +#ifdef WIN32 +#define Pragma(x) __pragma(#x) +#else +#define Pragma(x) _Pragma(#x) +#endif +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE) +#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else +#define OMP1 +#define OMPREDUCTION() ; +#endif + +typedef struct hypre_Boxloop_struct + { + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; + }hypre_Boxloop; + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{\ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ + {\ + zypre_BoxLoopSet();\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop0End()\ + }\ + zypre_BoxLoopInc1();\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ + { \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop2End(i1, i2)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + + +#define zypre_newBoxLoop3Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + zypre_BoxLoopSetK(3, i3);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop3End(i1, i2, i3)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + i3 += hypre__i0inc3;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + i3 += hypre__ikinc3[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1,\ + dbox2, start2, stride2, i2,\ + dbox3, start3, stride3, i3,\ + dbox4, start4, stride4, i4)\ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopDeclareK(3);\ + zypre_BoxLoopDeclareK(4);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\ + OMP1\ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + zypre_BoxLoopSetK(3, i3);\ + zypre_BoxLoopSetK(4, i4);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + i3 += hypre__i0inc3;\ + i4 += hypre__i0inc4;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + i3 += hypre__ikinc3[hypre__d];\ + i4 += hypre__ikinc4[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + sum) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMPREDUCTION() \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { -#include -#include -#include +#define hypre_newBoxLoop1ReductionEnd(i1, sum)\ + i1 += hypre__i0inc1;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} -#include "HYPRE_struct_mv.h" -#include "_hypre_utilities.h" +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + sum) \ +{\ + HYPRE_Int i1,i2; \ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + OMPREDUCTION() \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + {\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ + {\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ + { + +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ + }\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ + }\ + }\ +} + +#define hypre_LoopBegin(size,idx) \ +{ \ + HYPRE_Int idx; \ + for (idx = 0;idx < size;idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) \ +{ \ + HYPRE_Int hypre__tot = 1; \ + hypre_Boxloop databox1; \ + HYPRE_Int d,idx; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + for (d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + for (idx = 0;idx < hypre__tot;idx++) \ + { \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int i1 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + + +#define hypre_BoxBoundaryCopyEnd() \ + } \ +} + +#define hypre_BoxDataExchangeBegin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + HYPRE_Int hypre__tot = 1,idx; \ + hypre_Boxloop databox1,databox2; \ + HYPRE_Int d; \ + databox1.lsize0 = loop_size[0]; \ + databox1.lsize1 = loop_size[1]; \ + databox1.lsize2 = loop_size[2]; \ + databox1.strides0 = stride1[0]; \ + databox1.strides1 = stride1[1]; \ + databox1.strides2 = stride1[2]; \ + databox2.lsize0 = loop_size[0]; \ + databox2.lsize1 = loop_size[1]; \ + databox2.lsize2 = loop_size[2]; \ + databox2.strides0 = stride2[0]; \ + databox2.strides1 = stride2[1]; \ + databox2.strides2 = stride2[2]; \ + for (d = 0;d < ndim;d ++) \ + { \ + hypre__tot *= loop_size[d]; \ + } \ + for (idx = 0;idx < hypre__tot;idx++) \ + { \ + HYPRE_Int local_idx; \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int i1 = 0, i2 = 0; \ + local_idx = idx_local % databox1.lsize0; \ + idx_local = idx_local / databox1.lsize0; \ + i1 += local_idx*databox1.strides0; \ + i2 += local_idx*databox2.strides0; \ + local_idx = idx_local % databox1.lsize1; \ + idx_local = idx_local / databox1.lsize1; \ + i1 += local_idx*databox1.strides1; \ + i2 += local_idx*databox2.strides1; \ + local_idx = idx_local % databox1.lsize2; \ + idx_local = idx_local / databox1.lsize2; \ + i1 += local_idx*databox1.strides2; \ + i2 += local_idx*databox2.strides2; + +#define hypre_BoxDataExchangeEnd() \ + } \ +} + +#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex +#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_BoxLoop0Begin +#define hypre_BoxLoop0For zypre_BoxLoop0For +#define hypre_BoxLoop0End zypre_BoxLoop0End +#define hypre_BoxLoop1Begin zypre_BoxLoop1Begin +#define hypre_BoxLoop1For zypre_BoxLoop1For +#define hypre_BoxLoop1End zypre_BoxLoop1End +#define hypre_BoxLoop2Begin zypre_BoxLoop2Begin +#define hypre_BoxLoop2For zypre_BoxLoop2For +#define hypre_BoxLoop2End zypre_BoxLoop2End +#define hypre_BoxLoop3Begin zypre_BoxLoop3Begin +#define hypre_BoxLoop3For zypre_BoxLoop3For +#define hypre_BoxLoop3End zypre_BoxLoop3End +#define hypre_BoxLoop4Begin zypre_BoxLoop4Begin +#define hypre_BoxLoop4For zypre_BoxLoop4For +#define hypre_BoxLoop4End zypre_BoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin +#endif +#endif #ifdef __cplusplus extern "C" { #endif - /*BHEADER********************************************************************** * Copyright (c) 2008, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. @@ -196,329 +2684,155 @@ * BoxLoop macros: *--------------------------------------------------------------------------*/ -#if 0 /* set to 0 to use the new box loops */ +#ifdef HYPRE_USE_RAJA +#define hypre_Reductioninit(local_result)\ +HYPRE_Real local_result;\ +local_result = 0.0; +//ReduceSum< cuda_reduce, HYPRE_Real> local_result(0.0); +#else +#define hypre_Reductioninit(local_result)\ +HYPRE_Real local_result;\ +local_result = 0.0; +#endif -#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k +#if defined(HYPRE_MEMORY_GPU) -#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \ -HYPRE_Int sx = (hypre_IndexX(stride));\ -HYPRE_Int sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\ -HYPRE_Int sz = (hypre_IndexZ(stride)*\ - hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox)) - -#define hypre_BoxLoopDeclareN(loop_size) \ -HYPRE_Int hypre__i, hypre__j, hypre__k;\ -HYPRE_Int hypre__nx = hypre_IndexX(loop_size);\ -HYPRE_Int hypre__ny = hypre_IndexY(loop_size);\ -HYPRE_Int hypre__nz = hypre_IndexZ(loop_size);\ -HYPRE_Int hypre__mx = hypre__nx;\ -HYPRE_Int hypre__my = hypre__ny;\ -HYPRE_Int hypre__mz = hypre__nz;\ -HYPRE_Int hypre__dir, hypre__max;\ -HYPRE_Int hypre__div, hypre__mod;\ -HYPRE_Int hypre__block, hypre__num_blocks;\ -hypre__dir = 0;\ -hypre__max = hypre__nx;\ -if (hypre__ny > hypre__max)\ +#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\ +HYPRE_Int * indices_d;\ +HYPRE_Int indices_h[stencil_size];\ +HYPRE_Int * stencil_shape_d;\ +HYPRE_Int stencil_shape_h[size*stencil_size];\ +HYPRE_Complex * data_A = hypre_StructMatrixData(A);\ +indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\ +stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\ +for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\ {\ - hypre__dir = 1;\ - hypre__max = hypre__ny;\ + HYPRE_Int jj = 0;\ + indices_h[ii] = hypre_StructMatrixDataIndices(A)[i][ii];\ + if (size > 1) cdir = 0;\ + stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\ + for (jj = 1;jj < size;jj++)\ + stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\ }\ -if (hypre__nz > hypre__max)\ -{\ - hypre__dir = 2;\ - hypre__max = hypre__nz;\ -}\ -hypre__num_blocks = hypre_NumThreads();\ -if (hypre__max < hypre__num_blocks)\ -{\ - hypre__num_blocks = hypre__max;\ -}\ -if (hypre__num_blocks > 0)\ -{\ - hypre__div = hypre__max / hypre__num_blocks;\ - hypre__mod = hypre__max % hypre__num_blocks;\ -} +hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\ +hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\ -#define hypre_BoxLoopSet(i, j, k) \ -i = 0;\ -j = 0;\ -k = 0;\ -hypre__nx = hypre__mx;\ -hypre__ny = hypre__my;\ -hypre__nz = hypre__mz;\ -if (hypre__num_blocks > 1)\ -{\ - if (hypre__dir == 0)\ - {\ - i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ - else if (hypre__dir == 1)\ - {\ - j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ - else if (hypre__dir == 2)\ - {\ - k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\ - hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\ - }\ -} +#define hypre_StructGetMatrixBoxData(A, i, si) (data_A + indices_d[si]) -#define hypre_BoxLoopGetIndex(index) \ -index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k +#define hypre_StructGetIndexD(index,i,index_d) (index_d) -/* Use this before the For macros below to force only one block */ -#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1 - -/* Use this to get the block iteration inside a BoxLoop */ -#define hypre_BoxLoopBlock() hypre__block - -/*-----------------------------------*/ - -#define hypre_BoxLoop0Begin(ndim, loop_size)\ -{\ - hypre_BoxLoopDeclareN(loop_size); +#define hypre_StructCleanIndexD()\ +hypre_DeviceTFree(indices_d);\ +hypre_DeviceTFree(stencil_shape_d); -#define hypre_BoxLoop0For()\ - hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k) -#define hypre__BoxLoop0For(i, j, k)\ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ - {\ - hypre_BoxLoopSet(i, j, k);\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ - {\ - for (i = 0; i < hypre__nx; i++)\ - { +#define hypre_StructPreparePrint()\ +HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\ +data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\ +hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size); -#define hypre_BoxLoop0End()\ - }\ - }\ - }\ - }\ -} - -/*-----------------------------------*/ +#define hypre_StructPostPrint() hypre_TFree(data_host) -#define hypre_BoxLoop1Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1)\ -{\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareN(loop_size); +#else -#define hypre_BoxLoop1For(i1)\ - hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1) -#define hypre__BoxLoop1For(i, j, k, i1)\ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ - {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ - {\ - for (i = 0; i < hypre__nx; i++)\ - { +#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size) +#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si) +#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i) +#define hypre_StructCleanIndexD() {;} +#define hypre_StructPreparePrint() data_host = data; +#define hypre_StructPostPrint() {;} -#define hypre_BoxLoop1End(i1)\ - i1 += hypre__sx1;\ - }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - }\ - }\ -} +#endif -/*-----------------------------------*/ - -#define hypre_BoxLoop2Begin(ndim,loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2)\ +#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop2For(i1, i2)\ - hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2) -#define hypre__BoxLoop2For(i, j, k, i1, i2)\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopInit(ndim, loop_size);\ + hypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop2End(i1, i2)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ +#define hypre_SerialBoxLoop0End()\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ + zypre_BoxLoopInc1();\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - }\ }\ } -/*-----------------------------------*/ - -#define hypre_BoxLoop3Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2,\ - dbox3, start3, stride3, i3)\ +#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - HYPRE_Int hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop3For(i1, i2, i3)\ - hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3) -#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\ + HYPRE_Int i1;\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop3End(i1, i2, i3)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ - i3 += hypre__sx3;\ +#define hypre_SerialBoxLoop1End(i1)\ + i1 += hypre__i0inc1;\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ - i3 += hypre__sy3 - hypre__nx*hypre__sx3;\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - i3 += hypre__sz3 - hypre__ny*hypre__sy3;\ - }\ }\ } -/*-----------------------------------*/ - -#define hypre_BoxLoop4Begin(ndim, loop_size,\ - dbox1, start1, stride1, i1,\ - dbox2, start2, stride2, i2,\ - dbox3, start3, stride3, i3,\ - dbox4, start4, stride4, i4)\ +#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\ + dbox1, start1, stride1, i1,\ + dbox2, start2, stride2, i2)\ {\ - HYPRE_Int hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\ - HYPRE_Int hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\ - HYPRE_Int hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\ - HYPRE_Int hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\ - hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\ - hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\ - hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\ - hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\ - hypre_BoxLoopDeclareN(loop_size); - -#define hypre_BoxLoop4For(i1, i2, i3, i4)\ - hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4) -#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\ + HYPRE_Int i1,i2;\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\ + zypre_BoxLoopSetOneBlock();\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ - hypre_BoxLoopSet(i, j, k);\ - i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\ - i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\ - i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\ - i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\ - for (k = 0; k < hypre__nz; k++)\ - {\ - for (j = 0; j < hypre__ny; j++)\ + zypre_BoxLoopSet();\ + zypre_BoxLoopSetK(1, i1);\ + zypre_BoxLoopSetK(2, i2);\ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ {\ - for (i = 0; i < hypre__nx; i++)\ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\ { -#define hypre_BoxLoop4End(i1, i2, i3, i4)\ - i1 += hypre__sx1;\ - i2 += hypre__sx2;\ - i3 += hypre__sx3;\ - i4 += hypre__sx4;\ +#define hypre_SerialBoxLoop2End(i1, i2)\ + i1 += hypre__i0inc1;\ + i2 += hypre__i0inc2;\ }\ - i1 += hypre__sy1 - hypre__nx*hypre__sx1;\ - i2 += hypre__sy2 - hypre__nx*hypre__sx2;\ - i3 += hypre__sy3 - hypre__nx*hypre__sx3;\ - i4 += hypre__sy4 - hypre__nx*hypre__sx4;\ + zypre_BoxLoopInc1();\ + i1 += hypre__ikinc1[hypre__d];\ + i2 += hypre__ikinc2[hypre__d];\ + zypre_BoxLoopInc2();\ }\ - i1 += hypre__sz1 - hypre__ny*hypre__sy1;\ - i2 += hypre__sz2 - hypre__ny*hypre__sy2;\ - i3 += hypre__sz3 - hypre__ny*hypre__sy3;\ - i4 += hypre__sz4 - hypre__ny*hypre__sy4;\ - }\ }\ } -/*-----------------------------------*/ - +#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) +#define HYPRE_BOX_PRIVATE hypre__global_error #else - -#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE - -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_BoxLoop0Begin -#define hypre_BoxLoop0For zypre_BoxLoop0For -#define hypre_BoxLoop0End zypre_BoxLoop0End -#define hypre_BoxLoop1Begin zypre_BoxLoop1Begin -#define hypre_BoxLoop1For zypre_BoxLoop1For -#define hypre_BoxLoop1End zypre_BoxLoop1End -#define hypre_BoxLoop2Begin zypre_BoxLoop2Begin -#define hypre_BoxLoop2For zypre_BoxLoop2For -#define hypre_BoxLoop2End zypre_BoxLoop2End -#define hypre_BoxLoop3Begin zypre_BoxLoop3Begin -#define hypre_BoxLoop3For zypre_BoxLoop3For -#define hypre_BoxLoop3End zypre_BoxLoop3End -#define hypre_BoxLoop4Begin zypre_BoxLoop4Begin -#define hypre_BoxLoop4For zypre_BoxLoop4For -#define hypre_BoxLoop4End zypre_BoxLoop4End - -#endif /* end if 1 */ - +#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE #endif - -/****************************************************************************** - * - * NEW BoxLoop STUFF - * - *****************************************************************************/ - -#ifndef hypre_ZBOX_HEADER -#define hypre_ZBOX_HEADER - #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i -/*-------------------------------------------------------------------------- - * BoxLoop macros: - *--------------------------------------------------------------------------*/ - #define zypre_BoxLoopDeclare() \ HYPRE_Int hypre__tot, hypre__div, hypre__mod;\ HYPRE_Int hypre__block, hypre__num_blocks;\ @@ -655,6 +2969,7 @@ #define zypre_BoxLoop1Begin(ndim, loop_size,\ dbox1, start1, stride1, i1)\ {\ + HYPRE_Int i1;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopInit(ndim, loop_size);\ @@ -663,6 +2978,7 @@ #define zypre_BoxLoop1For(i1)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\ @@ -686,6 +3002,7 @@ dbox1, start1, stride1, i1,\ dbox2, start2, stride2, i2)\ {\ + HYPRE_Int i1,i2;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -696,6 +3013,7 @@ #define zypre_BoxLoop2For(i1, i2)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -723,6 +3041,7 @@ dbox2, start2, stride2, i2,\ dbox3, start3, stride3, i3)\ {\ + HYPRE_Int i1,i2,i3;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -735,6 +3054,7 @@ #define zypre_BoxLoop3For(i1, i2, i3)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2,i3;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -766,6 +3086,7 @@ dbox3, start3, stride3, i3,\ dbox4, start4, stride4, i4)\ {\ + HYPRE_Int i1,i2,i3,i4;\ zypre_BoxLoopDeclare();\ zypre_BoxLoopDeclareK(1);\ zypre_BoxLoopDeclareK(2);\ @@ -780,6 +3101,7 @@ #define zypre_BoxLoop4For(i1, i2, i3, i4)\ for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\ {\ + HYPRE_Int i1,i2,i3,i4;\ zypre_BoxLoopSet();\ zypre_BoxLoopSetK(1, i1);\ zypre_BoxLoopSetK(2, i2);\ @@ -808,6 +3130,32 @@ /*-----------------------------------*/ +#define zypre_BasicBoxLoopInitK(k, stridek) \ +hypre__sk##k[0] = stridek[0];\ +hypre__ikinc##k[0] = 0;\ +for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\ +{\ + hypre__sk##k[hypre__d] = stridek[hypre__d];\ + hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\ + hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\ +}\ +hypre__i0inc##k = hypre__sk##k[0];\ +hypre__ikinc##k[hypre__ndim] = 0;\ +hypre__ikstart##k = 0 + +#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\ + stride1, i1,\ + stride2, i2)\ +{\ + zypre_BoxLoopDeclare();\ + zypre_BoxLoopDeclareK(1);\ + zypre_BoxLoopDeclareK(2);\ + zypre_BoxLoopInit(ndim, loop_size);\ + zypre_BasicBoxLoopInitK(1, stride1);\ + zypre_BasicBoxLoopInitK(2, stride2); + +/*-----------------------------------*/ + #endif @@ -1469,6 +3817,9 @@ HYPRE_Complex **send_buffers; HYPRE_Complex **recv_buffers; + HYPRE_Complex **send_buffers_data; + HYPRE_Complex **recv_buffers_data; + /* set = 0, add = 1 */ HYPRE_Int action; @@ -1576,6 +3927,8 @@ #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers) #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers) #define hypre_CommHandleAction(comm_handle) (comm_handle -> action) +#define hypre_CommHandleSendBuffersDevice(comm_handle) (comm_handle -> send_buffers_data) +#define hypre_CommHandleRecvBuffersDevice(comm_handle) (comm_handle -> recv_buffers_data) #endif /*BHEADER********************************************************************** @@ -2028,7 +4381,6 @@ HYPRE_Int HYPRE_StructVectorGetMigrateCommPkg ( HYPRE_StructVector from_vector , HYPRE_StructVector to_vector , HYPRE_CommPkg *comm_pkg ); HYPRE_Int HYPRE_StructVectorMigrate ( HYPRE_CommPkg comm_pkg , HYPRE_StructVector from_vector , HYPRE_StructVector to_vector ); HYPRE_Int HYPRE_CommPkgDestroy ( HYPRE_CommPkg comm_pkg ); -HYPRE_Int HYPRE_StructVectorClone ( HYPRE_StructVector x, HYPRE_StructVector *y_ptr ); /* project.c */ HYPRE_Int hypre_ProjectBox ( hypre_Box *box , hypre_Index index , hypre_Index stride ); @@ -2141,7 +4493,7 @@ HYPRE_Int hypre_StructVectorAssemble ( hypre_StructVector *vector ); HYPRE_Int hypre_StructVectorCopy ( hypre_StructVector *x , hypre_StructVector *y ); HYPRE_Int hypre_StructVectorSetConstantValues ( hypre_StructVector *vector , HYPRE_Complex values ); -HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )(HYPRE_Int, HYPRE_Int, HYPRE_Int)); +HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )()); HYPRE_Int hypre_StructVectorClearGhostValues ( hypre_StructVector *vector ); HYPRE_Int hypre_StructVectorClearBoundGhostValues ( hypre_StructVector *vector , HYPRE_Int force ); HYPRE_Int hypre_StructVectorScaleValues ( hypre_StructVector *vector , HYPRE_Complex factor ); @@ -2152,7 +4504,6 @@ HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index ); hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector ); - #ifdef __cplusplus } #endif diff -Nru hypre-2.11.2/src/struct_mv/protos.h hypre-2.13.0/src/struct_mv/protos.h --- hypre-2.11.2/src/struct_mv/protos.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/protos.h 2017-10-20 17:42:22.000000000 +0000 @@ -304,4 +304,4 @@ HYPRE_Int hypre_StructVectorPrint ( const char *filename , hypre_StructVector *vector , HYPRE_Int all ); hypre_StructVector *hypre_StructVectorRead ( MPI_Comm comm , const char *filename , HYPRE_Int *num_ghost ); HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index ); - +hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector ); diff -Nru hypre-2.11.2/src/struct_mv/struct_axpy.c hypre-2.13.0/src/struct_mv/struct_axpy.c --- hypre-2.11.2/src/struct_mv/struct_axpy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_axpy.c 2017-10-20 17:42:22.000000000 +0000 @@ -29,9 +29,6 @@ { hypre_Box *x_data_box; hypre_Box *y_data_box; - - HYPRE_Int xi; - HYPRE_Int yi; HYPRE_Complex *xp; HYPRE_Complex *yp; @@ -59,12 +56,17 @@ yp = hypre_StructVectorBoxData(y, i); hypre_BoxGetSize(box, loop_size); - + +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR +#endif +#define HYPRE_BOX_PRIVATE_VAR xi,yi + hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, - x_data_box, start, unit_stride, xi, - y_data_box, start, unit_stride, yi); + x_data_box, start, unit_stride, xi, + y_data_box, start, unit_stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { diff -Nru hypre-2.11.2/src/struct_mv/struct_communication.c hypre-2.13.0/src/struct_mv/struct_communication.c --- hypre-2.11.2/src/struct_mv/struct_communication.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_communication.c 2017-10-20 17:42:22.000000000 +0000 @@ -19,6 +19,14 @@ FILE *file; #endif +/* This is needed to do communication in the GPU case */ +#if defined(HYPRE_MEMORY_GPU) +static HYPRE_Complex* global_recv_buffer; +static HYPRE_Complex* global_send_buffer; +static HYPRE_Int global_recv_size = 0; +static HYPRE_Int global_send_size = 0; +#endif + /* this computes a (large enough) size (in doubles) for the message prefix */ #define hypre_CommPrefixSize(ne) \ ( (((1+ne)*sizeof(HYPRE_Int) + ne*sizeof(hypre_Box))/sizeof(HYPRE_Complex)) + 1 ) @@ -82,7 +90,7 @@ HYPRE_Int *send_order; HYPRE_Int i, j, k, p, m, size, p_old, my_proc; - + /*------------------------------------------------------ *------------------------------------------------------*/ @@ -624,7 +632,7 @@ hypre_Box *box; hypre_Box *data_box; HYPRE_Int i, j; - + for (j = 0; j < num_entries; j++) { i = boxnums[j]; @@ -751,10 +759,10 @@ hypre_CommEntryTypeOffset(comm_entry) = offset; hypre_CommEntryTypeDim(comm_entry) = dim; hypre_CommEntryTypeOrder(comm_entry) = order; - + return hypre_error_flag; } - + /*-------------------------------------------------------------------------- * Initialize a non-blocking communication exchange. * @@ -781,19 +789,22 @@ HYPRE_Int num_sends = hypre_CommPkgNumSends(comm_pkg); HYPRE_Int num_recvs = hypre_CommPkgNumRecvs(comm_pkg); MPI_Comm comm = hypre_CommPkgComm(comm_pkg); - + HYPRE_Int num_requests; hypre_MPI_Request *requests; hypre_MPI_Status *status; HYPRE_Complex **send_buffers; HYPRE_Complex **recv_buffers; + HYPRE_Complex **send_buffers_data; + HYPRE_Complex **recv_buffers_data; + hypre_CommType *comm_type, *from_type, *to_type; hypre_CommEntryType *comm_entry; HYPRE_Int num_entries; HYPRE_Int *length_array; - HYPRE_Int *stride_array; + HYPRE_Int *stride_array, unitst_array[HYPRE_MAXDIM+1]; HYPRE_Int *order; HYPRE_Complex *dptr, *kptr, *lptr; @@ -801,7 +812,7 @@ HYPRE_Int i, j, d, ll; HYPRE_Int size; - + /*-------------------------------------------------------------------- * allocate requests and status *--------------------------------------------------------------------*/ @@ -828,6 +839,31 @@ } } + /* Prepare send buffers */ +#if defined(HYPRE_MEMORY_GPU) + send_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_sends); + if (num_sends > 0) + { + size = hypre_CommPkgSendBufsize(comm_pkg); + if (size > global_send_size) + { + if (global_send_size > 0) + hypre_DeviceTFree(global_send_buffer); + global_send_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size); + global_send_size = 5*size; + } + send_buffers_data[0] = global_send_buffer; + for (i = 1; i < num_sends; i++) + { + comm_type = hypre_CommPkgSendType(comm_pkg, i-1); + size = hypre_CommTypeBufsize(comm_type); + send_buffers_data[i] = send_buffers_data[i-1] + size; + } + } +#else + send_buffers_data = send_buffers; +#endif + /* allocate recv buffers */ recv_buffers = hypre_TAlloc(HYPRE_Complex *, num_recvs); if (num_recvs > 0) @@ -842,6 +878,31 @@ } } + /* Prepare recv buffers */ +#if defined(HYPRE_MEMORY_GPU) + recv_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_recvs); + if (num_recvs > 0) + { + size = hypre_CommPkgRecvBufsize(comm_pkg); + if (size > global_recv_size) + { + if (global_recv_size > 0) + hypre_DeviceTFree(global_recv_buffer); + global_recv_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size); + global_recv_size = 5*size; + } + recv_buffers_data[0] = global_recv_buffer; + for (i = 1; i < num_recvs; i++) + { + comm_type = hypre_CommPkgRecvType(comm_pkg, i-1); + size = hypre_CommTypeBufsize(comm_type); + recv_buffers_data[i] = recv_buffers_data[i-1] + size; + } + } +#else + recv_buffers_data = recv_buffers; +#endif + /*-------------------------------------------------------------------- * pack send buffers *--------------------------------------------------------------------*/ @@ -851,22 +912,9 @@ comm_type = hypre_CommPkgSendType(comm_pkg, i); num_entries = hypre_CommTypeNumEntries(comm_type); - dptr = (HYPRE_Complex *) send_buffers[i]; - + dptr = (HYPRE_Complex *) send_buffers_data[i]; if ( hypre_CommPkgFirstComm(comm_pkg) ) { - qptr = (HYPRE_Int *) send_buffers[i]; - *qptr = num_entries; - qptr ++; - memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type), - num_entries*sizeof(HYPRE_Int)); - qptr += num_entries; - memcpy(qptr, hypre_CommTypeRemBoxes(comm_type), - num_entries*sizeof(hypre_Box)); - - hypre_CommTypeRemBoxnums(comm_type) = NULL; - hypre_CommTypeRemBoxes(comm_type) = NULL; - dptr += hypre_CommPrefixSize(num_entries); } @@ -876,6 +924,11 @@ length_array = hypre_CommEntryTypeLengthArray(comm_entry); stride_array = hypre_CommEntryTypeStrideArray(comm_entry); order = hypre_CommEntryTypeOrder(comm_entry); + unitst_array[0] = 1; + for (d = 1; d <= ndim; d++) + { + unitst_array[d] = unitst_array[d-1]*length_array[d-1]; + } lptr = send_data + hypre_CommEntryTypeOffset(comm_entry); for (ll = 0; ll < num_values; ll++) @@ -884,50 +937,46 @@ { kptr = lptr + order[ll]*stride_array[ndim]; +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA) /* This is based on "Idea 2" in box.h */ { - HYPRE_Int i[HYPRE_MAXDIM+1]; HYPRE_Int n[HYPRE_MAXDIM+1]; HYPRE_Int s[HYPRE_MAXDIM+1]; - HYPRE_Complex *p[HYPRE_MAXDIM+1]; - HYPRE_Int I, N; + HYPRE_Int N; /* Initialize */ N = 1; for (d = 0; d < ndim; d++) { - i[d] = 0; n[d] = length_array[d]; s[d] = stride_array[d]; - p[d] = kptr; N *= n[d]; } - i[ndim] = 0; n[ndim] = 2; s[ndim] = 0; - p[ndim] = kptr; /* Emulate ndim nested for loops */ - d = 0; - for (I = 0; I < N; I++) + hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx) { - dptr[I] = *p[0]; - - while ( (i[d]+2) > n[d] ) - { - d++; - } - i[d]++; - p[d] += s[d]; - while ( d > 0 ) - { - d--; - i[d] = 0; - p[d] = p[d+1]; - } + dptr[idx] = kptr[i]; } - dptr += N; + hypre_BoxBoundaryCopyEnd(); } +#else + hypre_BasicBoxLoop2Begin(ndim, length_array, + stride_array, ki, + unitst_array, di); +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE +#endif + hypre_BoxLoop2For(ki, di) + { + dptr[di] = kptr[ki]; + } + hypre_BoxLoop2End(ki, di); +#endif + + dptr += unitst_array[ndim]; } else { @@ -936,13 +985,47 @@ { size *= length_array[d]; } - memset(dptr, 0, size*sizeof(HYPRE_Complex)); + hypre_DeviceMemset(dptr, 0, HYPRE_Complex, size); + dptr += size; } } } } + /* Copy buffer data from Device to Host */ +#if defined(HYPRE_MEMORY_GPU) + if (num_sends > 0) + { + HYPRE_Complex *dptr_host; + size = hypre_CommPkgSendBufsize(comm_pkg); + dptr_host = (HYPRE_Complex *) send_buffers[0]; + dptr = (HYPRE_Complex *) send_buffers_data[0]; + hypre_DataCopyFromData(dptr_host, dptr, HYPRE_Complex, size); + } +#endif + + for (i = 0; i < num_sends; i++) + { + comm_type = hypre_CommPkgSendType(comm_pkg, i); + num_entries = hypre_CommTypeNumEntries(comm_type); + + dptr = (HYPRE_Complex *) send_buffers[i]; + if ( hypre_CommPkgFirstComm(comm_pkg) ) + { + qptr = (HYPRE_Int *) send_buffers[i]; + *qptr = num_entries; + qptr ++; + memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type), + num_entries*sizeof(HYPRE_Int)); + qptr += num_entries; + memcpy(qptr, hypre_CommTypeRemBoxes(comm_type), + num_entries*sizeof(hypre_Box)); + hypre_CommTypeRemBoxnums(comm_type) = NULL; + hypre_CommTypeRemBoxes(comm_type) = NULL; + } + } + /*-------------------------------------------------------------------- * post receives and initiate sends *--------------------------------------------------------------------*/ @@ -1020,6 +1103,8 @@ hypre_CommHandleSendBuffers(comm_handle) = send_buffers; hypre_CommHandleRecvBuffers(comm_handle) = recv_buffers; hypre_CommHandleAction(comm_handle) = action; + hypre_CommHandleSendBuffersDevice(comm_handle) = send_buffers_data; + hypre_CommHandleRecvBuffersDevice(comm_handle) = recv_buffers_data; *comm_handle_ptr = comm_handle; @@ -1041,7 +1126,7 @@ HYPRE_Complex **send_buffers = hypre_CommHandleSendBuffers(comm_handle); HYPRE_Complex **recv_buffers = hypre_CommHandleRecvBuffers(comm_handle); HYPRE_Int action = hypre_CommHandleAction(comm_handle); - + HYPRE_Int ndim = hypre_CommPkgNDim(comm_pkg); HYPRE_Int num_values = hypre_CommPkgNumValues(comm_pkg); HYPRE_Int num_sends = hypre_CommPkgNumSends(comm_pkg); @@ -1052,7 +1137,7 @@ HYPRE_Int num_entries; HYPRE_Int *length_array; - HYPRE_Int *stride_array; + HYPRE_Int *stride_array, unitst_array[HYPRE_MAXDIM+1]; HYPRE_Complex *kptr, *lptr; HYPRE_Complex *dptr; @@ -1063,6 +1148,11 @@ HYPRE_Int i, j, d, ll; +#if defined(HYPRE_MEMORY_GPU) + HYPRE_Complex **send_buffers_data = hypre_CommHandleSendBuffersDevice(comm_handle); +#endif + HYPRE_Complex **recv_buffers_data = hypre_CommHandleRecvBuffersDevice(comm_handle); + /*-------------------------------------------------------------------- * finish communications *--------------------------------------------------------------------*/ @@ -1123,12 +1213,36 @@ * unpack receive buffer data *--------------------------------------------------------------------*/ + /* Copy buffer data from Host to Device */ +#if defined(HYPRE_MEMORY_GPU) + if (num_recvs > 0) + { + HYPRE_Complex *dptr_host; + HYPRE_Int size; + size = 0; + for (i = 0; i < num_recvs; i++) + { + comm_type = hypre_CommPkgRecvType(comm_pkg, i); + num_entries = hypre_CommTypeNumEntries(comm_type); + size += hypre_CommTypeBufsize(comm_type); + if ( hypre_CommPkgFirstComm(comm_pkg) ) + { + size += hypre_CommPrefixSize(num_entries); + } + } + dptr_host = (HYPRE_Complex *) recv_buffers[0]; + dptr = (HYPRE_Complex *) recv_buffers_data[0]; + hypre_DataCopyToData(dptr_host, dptr, HYPRE_Complex, size); + } +#endif + for (i = 0; i < num_recvs; i++) { comm_type = hypre_CommPkgRecvType(comm_pkg, i); num_entries = hypre_CommTypeNumEntries(comm_type); - dptr = (HYPRE_Complex *) recv_buffers[i]; + dptr = (HYPRE_Complex *) recv_buffers_data[i]; + if ( hypre_CommPkgFirstComm(comm_pkg) ) { dptr += hypre_CommPrefixSize(num_entries); @@ -1139,6 +1253,11 @@ comm_entry = hypre_CommTypeEntry(comm_type, j); length_array = hypre_CommEntryTypeLengthArray(comm_entry); stride_array = hypre_CommEntryTypeStrideArray(comm_entry); + unitst_array[0] = 1; + for (d = 1; d <= ndim; d++) + { + unitst_array[d] = unitst_array[d-1]*length_array[d-1]; + } lptr = hypre_CommHandleRecvData(comm_handle) + hypre_CommEntryTypeOffset(comm_entry); @@ -1146,59 +1265,60 @@ { kptr = lptr + ll*stride_array[ndim]; +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)|| defined(HYPRE_USE_CUDA) /* This is based on "Idea 2" in box.h */ { - HYPRE_Int i[HYPRE_MAXDIM+1]; HYPRE_Int n[HYPRE_MAXDIM+1]; HYPRE_Int s[HYPRE_MAXDIM+1]; - HYPRE_Complex *p[HYPRE_MAXDIM+1]; - HYPRE_Int I, N; + HYPRE_Int N; /* Initialize */ N = 1; for (d = 0; d < ndim; d++) { - i[d] = 0; n[d] = length_array[d]; s[d] = stride_array[d]; - p[d] = kptr; N *= n[d]; } - i[ndim] = 0; n[ndim] = 2; s[ndim] = 0; - p[ndim] = kptr; /* Emulate ndim nested for loops */ - d = 0; - for (I = 0; I < N; I++) + hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx) { if (action > 0) { - /* add the data to existing values in memory */ - *p[0] += dptr[I]; + kptr[i] += dptr[idx]; } else { - /* copy the data over existing values in memory */ - *p[0] = dptr[I]; - } - - while ( (i[d]+2) > n[d] ) - { - d++; - } - i[d]++; - p[d] += s[d]; - while ( d > 0 ) - { - d--; - i[d] = 0; - p[d] = p[d+1]; + kptr[i] = dptr[idx]; } } - dptr += N; + hypre_BoxBoundaryCopyEnd(); + } +#else + hypre_BasicBoxLoop2Begin(ndim, length_array, + stride_array, ki, + unitst_array, di); +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE +#endif + hypre_BoxLoop2For(ki, di) + { + if (action > 0) + { + kptr[ki] += dptr[di]; + } + else + { + kptr[ki] = dptr[di]; + } } + hypre_BoxLoop2End(ki, di); +#endif + + dptr += unitst_array[ndim]; } } } @@ -1223,9 +1343,18 @@ { hypre_SharedTFree(recv_buffers[0]); } + + hypre_TFree(comm_handle); + +#if defined(HYPRE_MEMORY_GPU) hypre_TFree(send_buffers); hypre_TFree(recv_buffers); - hypre_TFree(comm_handle); + hypre_TFree(send_buffers_data); + hypre_TFree(recv_buffers_data); +#else + hypre_TFree(send_buffers); + hypre_TFree(recv_buffers); +#endif return hypre_error_flag; } @@ -1251,9 +1380,10 @@ HYPRE_Int *fr_stride_array; HYPRE_Complex *to_dp; HYPRE_Int *to_stride_array; - + HYPRE_Complex *fr_dpl, *to_dpl; + HYPRE_Int *length_array; - HYPRE_Int i, d, ll; + HYPRE_Int i, ll; HYPRE_Int *order; @@ -1285,17 +1415,20 @@ { if (order[ll] > -1) { + fr_dpl = fr_dp + (order[ll])*fr_stride_array[ndim]; + to_dpl = to_dp + ( ll )*to_stride_array[ndim]; + +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA) /* This is based on "Idea 2" in box.h */ { - HYPRE_Int i[HYPRE_MAXDIM+1]; + //HYPRE_Int i[HYPRE_MAXDIM+1]; HYPRE_Int n[HYPRE_MAXDIM+1]; HYPRE_Int fs[HYPRE_MAXDIM+1], ts[HYPRE_MAXDIM+1]; HYPRE_Complex *fp[HYPRE_MAXDIM+1], *tp[HYPRE_MAXDIM+1]; - HYPRE_Int I, N; + HYPRE_Int N,d; /* Initialize */ N = 1; - i[ndim] = 0; n[ndim] = 2; fs[ndim] = 0; ts[ndim] = 0; @@ -1303,7 +1436,6 @@ tp[ndim] = to_dp + ( ll )*to_stride_array[ndim]; for (d = 0; d < ndim; d++) { - i[d] = 0; n[d] = length_array[d]; fs[d] = fr_stride_array[d]; ts[d] = to_stride_array[d]; @@ -1313,36 +1445,43 @@ } /* Emulate ndim nested for loops */ - d = 0; - for (I = 0; I < N; I++) + hypre_BoxDataExchangeBegin(ndim, n, fs, i1, ts, i2) { if (action > 0) { /* add the data to existing values in memory */ - *tp[0] += *fp[0]; + to_dpl[i2] += fr_dpl[i1]; } else { /* copy the data over existing values in memory */ - *tp[0] = *fp[0]; - } - - while ( (i[d]+2) > n[d] ) - { - d++; - } - i[d]++; - fp[d] += fs[d]; - tp[d] += ts[d]; - while ( d > 0 ) - { - d--; - i[d] = 0; - fp[d] = fp[d+1]; - tp[d] = tp[d+1]; + to_dpl[i2] = fr_dpl[i1]; } } + hypre_BoxDataExchangeEnd(); + } +#else + hypre_BasicBoxLoop2Begin(ndim, length_array, + fr_stride_array, fi, + to_stride_array, ti); +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE +#endif + hypre_BoxLoop2For(fi, ti) + { + if (action > 0) + { + /* add the data to existing values in memory */ + to_dpl[ti] += fr_dpl[fi]; + } + else + { + /* copy the data over existing values in memory */ + to_dpl[ti] = fr_dpl[fi]; + } } + hypre_BoxLoop2End(fi, ti); +#endif } } } diff -Nru hypre-2.11.2/src/struct_mv/struct_communication.h hypre-2.13.0/src/struct_mv/struct_communication.h --- hypre-2.11.2/src/struct_mv/struct_communication.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_communication.h 2017-10-20 17:42:22.000000000 +0000 @@ -141,6 +141,9 @@ HYPRE_Complex **send_buffers; HYPRE_Complex **recv_buffers; + HYPRE_Complex **send_buffers_data; + HYPRE_Complex **recv_buffers_data; + /* set = 0, add = 1 */ HYPRE_Int action; @@ -248,5 +251,7 @@ #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers) #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers) #define hypre_CommHandleAction(comm_handle) (comm_handle -> action) +#define hypre_CommHandleSendBuffersDevice(comm_handle) (comm_handle -> send_buffers_data) +#define hypre_CommHandleRecvBuffersDevice(comm_handle) (comm_handle -> recv_buffers_data) #endif diff -Nru hypre-2.11.2/src/struct_mv/struct_copy.c hypre-2.13.0/src/struct_mv/struct_copy.c --- hypre-2.11.2/src/struct_mv/struct_copy.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_copy.c 2017-10-20 17:42:22.000000000 +0000 @@ -29,9 +29,6 @@ hypre_Box *x_data_box; hypre_Box *y_data_box; - HYPRE_Int xi; - HYPRE_Int yi; - HYPRE_Complex *xp; HYPRE_Complex *yp; @@ -63,7 +60,7 @@ x_data_box, start, unit_stride, xi, y_data_box, start, unit_stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -89,9 +86,6 @@ hypre_Box *x_data_box; hypre_Box *y_data_box; - HYPRE_Int xi; - HYPRE_Int yi; - HYPRE_Complex *xp; HYPRE_Complex *yp; @@ -127,7 +121,7 @@ x_data_box, start, unit_stride, xi, y_data_box, start, unit_stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { diff -Nru hypre-2.11.2/src/struct_mv/struct_grid.c hypre-2.13.0/src/struct_mv/struct_grid.c --- hypre-2.11.2/src/struct_mv/struct_grid.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_grid.c 2017-10-20 17:42:22.000000000 +0000 @@ -324,9 +324,7 @@ { p = 1; hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop0Begin(ndim, loop_size); - hypre_BoxLoopSetOneBlock(); - hypre_BoxLoop0For() + hypre_SerialBoxLoop0Begin(ndim, loop_size); { pshift = pshifts[p]; hypre_BoxLoopGetIndex(pshift); @@ -345,7 +343,7 @@ p++; } } - hypre_BoxLoop0End(); + hypre_SerialBoxLoop0End(); } hypre_BoxDestroy(box); diff -Nru hypre-2.11.2/src/struct_mv/struct_innerprod.c hypre-2.13.0/src/struct_mv/struct_innerprod.c --- hypre-2.11.2/src/struct_mv/struct_innerprod.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_innerprod.c 2017-10-20 17:42:22.000000000 +0000 @@ -27,15 +27,11 @@ hypre_StructVector *y ) { HYPRE_Real final_innerprod_result; - HYPRE_Real local_result; HYPRE_Real process_result; hypre_Box *x_data_box; hypre_Box *y_data_box; - HYPRE_Int xi; - HYPRE_Int yi; - HYPRE_Complex *xp; HYPRE_Complex *yp; @@ -44,42 +40,46 @@ hypre_Index loop_size; hypre_IndexRef start; hypre_Index unit_stride; - + + HYPRE_Int ndim = hypre_StructVectorNDim(x); HYPRE_Int i; - local_result = 0.0; - process_result = 0.0; - + hypre_Reductioninit(local_result); + hypre_SetIndex(unit_stride, 1); - + boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(y)); hypre_ForBoxI(i, boxes) { box = hypre_BoxArrayBox(boxes, i); start = hypre_BoxIMin(box); - + x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i); - + xp = hypre_StructVectorBoxData(x, i); yp = hypre_StructVectorBoxData(y, i); - + hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, - x_data_box, start, unit_stride, xi, - y_data_box, start, unit_stride, yi); -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) reduction(+:local_result) HYPRE_SMP_SCHEDULE +#ifdef HYPRE_BOX_PRIVATE_VAR +#undef HYPRE_BOX_PRIVATE_VAR #endif - hypre_BoxLoop2For(xi, yi) +#define HYPRE_BOX_PRIVATE_VAR xi,yi +#ifdef HYPRE_BOX_REDUCTION +#undef HYPRE_BOX_REDUCTION +#endif +#define HYPRE_BOX_REDUCTION reduction(+:local_result) + hypre_newBoxLoop2ReductionBegin(ndim, loop_size, + x_data_box, start, unit_stride, xi, + y_data_box, start, unit_stride, yi,local_result); { - local_result += xp[xi] * hypre_conj(yp[yi]); + local_result += xp[xi] * hypre_conj(yp[yi]); } - hypre_BoxLoop2End(xi, yi); + hypre_newBoxLoop2ReductionEnd(xi, yi, local_result); } process_result = local_result; - + hypre_MPI_Allreduce(&process_result, &final_innerprod_result, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, hypre_StructVectorComm(x)); diff -Nru hypre-2.11.2/src/struct_mv/struct_io.c hypre-2.13.0/src/struct_mv/struct_io.c --- hypre-2.11.2/src/struct_mv/struct_io.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_io.c 2017-10-20 17:42:22.000000000 +0000 @@ -34,7 +34,6 @@ hypre_Box *data_box; HYPRE_Int data_box_volume; - HYPRE_Int datai; hypre_Index loop_size; hypre_IndexRef start; @@ -43,11 +42,12 @@ HYPRE_Int i, j, d; HYPRE_Complex value; - + HYPRE_Complex *data_host; /*---------------------------------------- * Print data *----------------------------------------*/ - + hypre_StructPreparePrint(); + hypre_SetIndex(stride, 1); hypre_ForBoxI(i, box_array) @@ -59,10 +59,9 @@ data_box_volume = hypre_BoxVolume(data_box); hypre_BoxGetSize(box, loop_size); - - hypre_BoxLoop1Begin(dim, loop_size, - data_box, start, stride, datai); - hypre_BoxLoop1For(datai) + + hypre_SerialBoxLoop1Begin(dim, loop_size, + data_box, start, stride, datai); { /* Print lines of the form: "%d: (%d, %d, %d; %d) %.14e\n" */ hypre_BoxLoopGetIndex(index); @@ -75,7 +74,7 @@ hypre_fprintf(file, ", %d", hypre_IndexD(start, d) + hypre_IndexD(index, d)); } - value = data[datai + j*data_box_volume]; + value = data_host[datai + j*data_box_volume]; #ifdef HYPRE_COMPLEX hypre_fprintf(file, "; %d) %.14e , %.14e\n", j, hypre_creal(value), hypre_cimag(value)); @@ -84,11 +83,13 @@ #endif } } - hypre_BoxLoop1End(datai); + hypre_SerialBoxLoop1End(datai); - data += num_values*data_box_volume; + data_host += num_values*data_box_volume; } + hypre_StructPostPrint(); + return hypre_error_flag; } @@ -112,7 +113,7 @@ hypre_Box *box; hypre_Box *data_box; - HYPRE_Int data_box_volume, datai; + HYPRE_Int data_box_volume; hypre_Index loop_size; hypre_IndexRef start; @@ -156,9 +157,8 @@ hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop1Begin(dim, loop_size, - data_box, start, stride, datai); - hypre_BoxLoop1For(datai) + hypre_SerialBoxLoop1Begin(dim, loop_size, + data_box, start, stride, datai); { /* Print line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */ hypre_BoxLoopGetIndex(index); @@ -177,7 +177,7 @@ hypre_fprintf(file, "; %d) %.14e\n", center_rank, value); #endif } - hypre_BoxLoop1End(datai); + hypre_SerialBoxLoop1End(datai); data += data_box_volume; } @@ -242,7 +242,6 @@ hypre_Box *data_box; HYPRE_Int data_box_volume; - HYPRE_Int datai; hypre_Index loop_size; hypre_IndexRef start; @@ -266,9 +265,8 @@ hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop1Begin(dim, loop_size, - data_box, start, stride, datai); - hypre_BoxLoop1For(datai) + hypre_SerialBoxLoop1Begin(dim, loop_size, + data_box, start, stride, datai); { /* Read lines of the form: "%d: (%d, %d, %d; %d) %le\n" */ for (j = 0; j < num_values; j++) @@ -282,7 +280,7 @@ &idummy, &data[datai + j*data_box_volume]); } } - hypre_BoxLoop1End(datai); + hypre_SerialBoxLoop1End(datai); data += num_values*data_box_volume; } @@ -308,7 +306,6 @@ hypre_Box *data_box; HYPRE_Int data_box_volume, constant_stencil_size; - HYPRE_Int datai; hypre_Index loop_size; hypre_IndexRef start; @@ -348,9 +345,8 @@ if ( constant_coefficient==2 ) { - hypre_BoxLoop1Begin(dim, loop_size, - data_box, start, stride, datai); - hypre_BoxLoop1For(datai) + hypre_SerialBoxLoop1Begin(dim, loop_size, + data_box, start, stride, datai); { /* Read line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */ hypre_fscanf(file, "%d: (%d", &idummy, &idummy); @@ -360,7 +356,7 @@ } hypre_fscanf(file, "; %d) %le\n", &idummy, &data[datai]); } - hypre_BoxLoop1End(datai); + hypre_SerialBoxLoop1End(datai); data += data_box_volume; } diff -Nru hypre-2.11.2/src/struct_mv/struct_matrix.c hypre-2.13.0/src/struct_mv/struct_matrix.c --- hypre-2.11.2/src/struct_mv/struct_matrix.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_matrix.c 2017-10-20 17:42:22.000000000 +0000 @@ -103,12 +103,18 @@ { if (hypre_StructMatrixDataAlloced(matrix)) { - hypre_SharedTFree(hypre_StructMatrixData(matrix)); + hypre_DeviceTFree(hypre_StructMatrixData(matrix)); } hypre_CommPkgDestroy(hypre_StructMatrixCommPkg(matrix)); hypre_ForBoxI(i, hypre_StructMatrixDataSpace(matrix)) - hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]); + { + if (hypre_StructMatrixConstantCoefficient(matrix) < 2) + hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]); + else + hypre_UMTFree(hypre_StructMatrixDataIndices(matrix)[i]); + } + hypre_TFree(hypre_StructMatrixDataIndices(matrix)); hypre_BoxArrayDestroy(hypre_StructMatrixDataSpace(matrix)); @@ -340,7 +346,7 @@ data_box = hypre_BoxArrayBox(data_space, i); data_box_volume = hypre_BoxVolume(data_box); - data_indices[i] = hypre_CTAlloc(HYPRE_Int, stencil_size); + data_indices[i] = hypre_UMCTAlloc(HYPRE_Int, stencil_size); /* set pointers for "stored" coefficients */ for (j = 0; j < stencil_size; j++) @@ -423,11 +429,17 @@ hypre_StructMatrixInitialize( hypre_StructMatrix *matrix ) { HYPRE_Complex *data; - + HYPRE_Int constant_coefficient; + constant_coefficient = hypre_StructMatrixConstantCoefficient(matrix); hypre_StructMatrixInitializeShell(matrix); - data = hypre_StructMatrixData(matrix); - data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix)); + //data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix)); + + if (constant_coefficient == 0) + data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix)); + else + data = hypre_UMCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix)); + hypre_StructMatrixInitializeData(matrix, data); hypre_StructMatrixDataAlloced(matrix) = 1; @@ -1007,7 +1019,6 @@ hypre_Box *data_box; hypre_IndexRef data_start; hypre_Index data_stride; - HYPRE_Int datai; HYPRE_Complex *datap; hypre_Index loop_size; @@ -1074,7 +1085,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size, data_box,data_start,data_stride,datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(datai) { @@ -1129,7 +1140,7 @@ hypre_IndexRef start; hypre_Index stride; HYPRE_Complex *datap; - HYPRE_Int i, j, ei, datai; + HYPRE_Int i, j, ei; HYPRE_Int num_entries; /* End - variables for ghost layer identity code below */ @@ -1206,7 +1217,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size, data_box, start, stride, datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(datai) { @@ -1403,7 +1414,6 @@ HYPRE_Int ndim = hypre_StructMatrixNDim(matrix); hypre_Box *m_data_box; - HYPRE_Int mi; HYPRE_Complex *mp; hypre_StructStencil *stencil; @@ -1451,7 +1461,7 @@ hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size, m_data_box, start, unit_stride, mi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(mi) { @@ -1832,7 +1842,7 @@ hypre_StructStencil *stencil; hypre_BoxArray *boundary; - HYPRE_Int i, i2, ixyz, j; + HYPRE_Int i, i2, j; /*----------------------------------------------------------------------- * Set the matrix coefficients @@ -1857,7 +1867,7 @@ data_box = hypre_BoxArrayBox(data_space, i); boundary = hypre_BoxArrayCreate( 0, ndim ); hypre_GeneralBoxBoundaryIntersect(grid_box, grid, stencil_element, - boundary); + boundary); data = hypre_StructMatrixBoxData(matrix, i, j); hypre_ForBoxI(i2, boundary) { @@ -1866,7 +1876,7 @@ start = hypre_BoxIMin(tmp_box); hypre_BoxLoop1Begin(ndim, loop_size, data_box, start, stride, ixyz); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ixyz) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(ixyz) { diff -Nru hypre-2.11.2/src/struct_mv/struct_matvec.c hypre-2.13.0/src/struct_mv/struct_matvec.c --- hypre-2.11.2/src/struct_mv/struct_matvec.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_matvec.c 2017-10-20 17:42:22.000000000 +0000 @@ -109,8 +109,6 @@ hypre_BoxArrayArray *compute_box_aa; hypre_Box *y_data_box; - HYPRE_Int yi; - HYPRE_Complex *xp; HYPRE_Complex *yp; @@ -158,7 +156,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(yi) { @@ -222,7 +220,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(yi) { @@ -237,7 +235,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(yi) { @@ -321,8 +319,6 @@ HYPRE_Int xoff4; HYPRE_Int xoff5; HYPRE_Int xoff6; - HYPRE_Int Ai; - HYPRE_Int xi; hypre_BoxArray *compute_box_a; hypre_Box *compute_box; @@ -338,7 +334,6 @@ HYPRE_Int depth; hypre_Index loop_size; hypre_IndexRef start; - HYPRE_Int yi; HYPRE_Int ndim; stencil = hypre_StructMatrixStencil(A); @@ -399,7 +394,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -442,7 +437,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -481,7 +476,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -516,7 +511,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -547,7 +542,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -574,7 +569,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -597,7 +592,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -666,7 +661,7 @@ HYPRE_Int xoff5; HYPRE_Int xoff6; HYPRE_Int Ai; - HYPRE_Int xi; + hypre_BoxArray *compute_box_a; hypre_Box *compute_box; @@ -681,7 +676,6 @@ HYPRE_Int depth; hypre_Index loop_size; hypre_IndexRef start; - HYPRE_Int yi; HYPRE_Int ndim; stencil = hypre_StructMatrixStencil(A); @@ -749,7 +743,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -796,7 +790,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -838,7 +832,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -875,7 +869,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -907,7 +901,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -934,7 +928,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -956,7 +950,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1010,8 +1004,7 @@ HYPRE_Int xoff6; HYPRE_Int si_center, center_rank; hypre_Index center_index; - HYPRE_Int Ai, Ai_CC; - HYPRE_Int xi; + HYPRE_Int Ai_CC; hypre_BoxArray *compute_box_a; hypre_Box *compute_box; @@ -1027,7 +1020,6 @@ HYPRE_Int depth; hypre_Index loop_size; hypre_IndexRef start; - HYPRE_Int yi; HYPRE_Int ndim; stencil = hypre_StructMatrixStencil(A); @@ -1120,7 +1112,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1180,7 +1172,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1233,7 +1225,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1280,7 +1272,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1321,7 +1313,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1356,7 +1348,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1382,7 +1374,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi, yi) { @@ -1405,7 +1397,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { @@ -1421,7 +1413,7 @@ x_data_box, start, stride, xi, y_data_box, start, stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(Ai, xi, yi) { diff -Nru hypre-2.11.2/src/struct_mv/struct_scale.c hypre-2.13.0/src/struct_mv/struct_scale.c --- hypre-2.11.2/src/struct_mv/struct_scale.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_scale.c 2017-10-20 17:42:22.000000000 +0000 @@ -28,7 +28,6 @@ { hypre_Box *y_data_box; - HYPRE_Int yi; HYPRE_Complex *yp; hypre_BoxArray *boxes; @@ -55,7 +54,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(y), loop_size, y_data_box, start, unit_stride, yi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(yi) { diff -Nru hypre-2.11.2/src/struct_mv/struct_vector.c hypre-2.13.0/src/struct_mv/struct_vector.c --- hypre-2.11.2/src/struct_mv/struct_vector.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/struct_mv/struct_vector.c 2017-10-20 17:42:22.000000000 +0000 @@ -70,7 +70,7 @@ { if (hypre_StructVectorDataAlloced(vector)) { - hypre_SharedTFree(hypre_StructVectorData(vector)); + hypre_DeviceTFree(hypre_StructVectorData(vector)); } hypre_TFree(hypre_StructVectorDataIndices(vector)); hypre_BoxArrayDestroy(hypre_StructVectorDataSpace(vector)); @@ -186,7 +186,8 @@ hypre_StructVectorInitializeShell(vector); - data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector)); + data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector)); + hypre_StructVectorInitializeData(vector, data); hypre_StructVectorDataAlloced(vector) = 1; @@ -294,13 +295,11 @@ hypre_Box *data_box; hypre_IndexRef data_start; hypre_Index data_stride; - HYPRE_Int datai; HYPRE_Complex *datap; hypre_Box *dval_box; hypre_Index dval_start; hypre_Index dval_stride; - HYPRE_Int dvali; hypre_Index loop_size; @@ -364,7 +363,7 @@ data_box,data_start,data_stride,datai, dval_box,dval_start,dval_stride,dvali); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai,dvali) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(datai, dvali) { @@ -480,7 +479,6 @@ hypre_Box *data_box; hypre_IndexRef data_start; hypre_Index data_stride; - HYPRE_Int datai; HYPRE_Complex *datap; hypre_Index loop_size; @@ -539,7 +537,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, data_box,data_start,data_stride,datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(datai) { @@ -562,15 +560,26 @@ { HYPRE_Complex *data = hypre_StructVectorData(vector); HYPRE_Int data_size = hypre_StructVectorDataSize(vector); - HYPRE_Int i; + hypre_Index imin, imax; + hypre_Box *box; + + box = hypre_BoxCreate(1); + hypre_IndexD(imin, 0) = 1; + hypre_IndexD(imax, 0) = data_size; + hypre_BoxSetExtents(box, imin, imax); + hypre_BoxLoop1Begin(1, imax, + box, imin, imin, datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif - for (i = 0; i < data_size; i++) + hypre_BoxLoop1For(datai) { - data[i] = 0.0; + data[datai] = 0.0; } + hypre_BoxLoop1End(datai); + + hypre_BoxDestroy(box); return hypre_error_flag; } @@ -616,7 +625,6 @@ { hypre_Box *x_data_box; - HYPRE_Int vi; HYPRE_Complex *xp, *yp; hypre_BoxArray *boxes; @@ -649,7 +657,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size, x_data_box, start, unit_stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { @@ -670,7 +678,6 @@ { hypre_Box *v_data_box; - HYPRE_Int vi; HYPRE_Complex *vp; hypre_BoxArray *boxes; @@ -702,7 +709,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { @@ -727,7 +734,6 @@ { hypre_Box *v_data_box; - HYPRE_Int vi; HYPRE_Complex *vp; hypre_BoxArray *boxes; @@ -756,27 +762,28 @@ hypre_BoxGetSize(box, loop_size); - hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, - v_data_box, start, unit_stride, vi); i = hypre_IndexD(start, 0); j = hypre_IndexD(start, 1); k = hypre_IndexD(start, 2); + + hypre_SerialBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + v_data_box, start, unit_stride, vi); + /* RDF: This won't work as written with threading on */ + #if 0 #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE #endif #else - hypre_BoxLoopSetOneBlock(); #endif - hypre_BoxLoop1For(vi) { vp[vi] = fcn(i, j, k); i++; j++; k++; } - hypre_BoxLoop1End(vi); + hypre_SerialBoxLoop1End(vi); } return hypre_error_flag; @@ -791,7 +798,6 @@ HYPRE_Int ndim = hypre_StructVectorNDim(vector); hypre_Box *v_data_box; - HYPRE_Int vi; HYPRE_Complex *vp; hypre_BoxArray *boxes; @@ -830,7 +836,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { @@ -853,7 +859,6 @@ HYPRE_Int force ) { HYPRE_Int ndim = hypre_StructVectorNDim(vector); - HYPRE_Int vi; HYPRE_Complex *vp; hypre_BoxArray *boxes; hypre_Box *box; @@ -905,7 +910,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, stride, vi); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(vi) { @@ -930,7 +935,6 @@ HYPRE_Int hypre_StructVectorScaleValues( hypre_StructVector *vector, HYPRE_Complex factor ) { - HYPRE_Int datai; HYPRE_Complex *data; hypre_Index imin; @@ -953,7 +957,7 @@ hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, box, imin, imin, datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(datai) { @@ -1180,7 +1184,6 @@ A hypre_Index corresponding to max_index is put in max_xyz_index. We assume that there is only one box to deal with. */ { - HYPRE_Int datai; HYPRE_Real *data; hypre_Index imin; @@ -1211,16 +1214,16 @@ data = hypre_StructVectorBoxData(vector, i); hypre_BoxGetSize(box, loop_size); hypre_CopyIndex( hypre_BoxIMin(box), imin ); - - hypre_BoxLoop1Begin(ndim, loop_size, - box, imin, unit_stride, datai); maxindex = hypre_BoxIndexRank( box, imin ); maxvalue = data[maxindex]; hypre_SetIndex(max_xyz_index, 0); +/*FIXME: must run sequentially*/ + zypre_BoxLoop1Begin(ndim, loop_size, + box, imin, unit_stride, datai); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif - hypre_BoxLoop1For(datai) + zypre_BoxLoop1For(datai) { if ( data[datai] > maxvalue ) { @@ -1229,7 +1232,7 @@ hypre_BoxLoopGetIndex(max_xyz_index); } } - hypre_BoxLoop1End(datai); + zypre_BoxLoop1End(datai); hypre_AddIndexes(max_xyz_index, imin, ndim, max_xyz_index); } @@ -1245,7 +1248,7 @@ *--------------------------------------------------------------------------*/ hypre_StructVector * hypre_StructVectorClone( - hypre_StructVector *x) + hypre_StructVector *x) { MPI_Comm comm = hypre_StructVectorComm(x); hypre_StructGrid *grid = hypre_StructVectorGrid(x); @@ -1259,11 +1262,12 @@ hypre_StructVectorDataSize(y) = data_size; hypre_StructVectorDataSpace(y) = hypre_BoxArrayDuplicate(data_space); - hypre_StructVectorData(y) = hypre_CTAlloc(HYPRE_Complex,data_size); + hypre_StructVectorData(y) = hypre_DeviceCTAlloc(HYPRE_Complex, data_size); + hypre_StructVectorDataIndices(y) = hypre_CTAlloc(HYPRE_Int, data_space_size); for (i=0; i < data_space_size; i++) - hypre_StructVectorDataIndices(y)[i] = data_indices[i]; + hypre_StructVectorDataIndices(y)[i] = data_indices[i]; hypre_StructVectorCopy( x, y ); diff -Nru hypre-2.11.2/src/test/ams_driver.c hypre-2.13.0/src/test/ams_driver.c --- hypre-2.11.2/src/test/ams_driver.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/ams_driver.c 2017-10-20 17:42:22.000000000 +0000 @@ -128,7 +128,7 @@ hypre_MPI_Init(&argc, &argv); hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid); - + hypre_GPUInit(-1); /* Set defaults */ solver_id = 3; maxit = 100; @@ -752,7 +752,7 @@ if (zero_cond) HYPRE_ParVectorDestroy(interior_nodes); - + hypre_GPUFinalize(); hypre_MPI_Finalize(); if (HYPRE_GetError() && !myid) diff -Nru hypre-2.11.2/src/test/for_maxwell.c hypre-2.13.0/src/test/for_maxwell.c --- hypre-2.11.2/src/test/for_maxwell.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/for_maxwell.c 2017-10-20 17:42:22.000000000 +0000 @@ -1935,7 +1935,7 @@ for (j = 0; j < data.max_boxsize; j++) { values[j]= sin((HYPRE_Real)(j+1)); - values[j]= (HYPRE_Real) rand()/RAND_MAX; + values[j]= (HYPRE_Real) hypre_Rand(); values[j]= (HYPRE_Real) j; } for (part = 0; part < data.nparts; part++) diff -Nru hypre-2.11.2/src/test/ij.c hypre-2.13.0/src/test/ij.c --- hypre-2.11.2/src/test/ij.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/ij.c 2017-10-20 17:42:22.000000000 +0000 @@ -44,6 +44,9 @@ #include "multivector.h" #include "HYPRE_MatvecFunctions.h" +/* max dt */ +#define DT_INF 1.0e30 + HYPRE_Int BuildParIsoLaplacian( HYPRE_Int argc, char** argv, HYPRE_ParCSRMatrix *A_ptr ); @@ -58,7 +61,7 @@ HYPRE_Int BuildParLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr ); HYPRE_Int BuildParSysLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr ); -HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr ); +HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr); HYPRE_Int BuildParFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_Int num_functions , HYPRE_ParCSRMatrix *A_ptr ); HYPRE_Int BuildFuncsFromFiles (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr ); HYPRE_Int BuildFuncsFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr ); @@ -132,7 +135,7 @@ HYPRE_IJVector *ij_rbm; HYPRE_ParCSRMatrix parcsr_A; - HYPRE_ParVector b; + HYPRE_ParVector b = NULL; HYPRE_ParVector x; HYPRE_ParVector *interp_vecs = NULL; @@ -173,7 +176,7 @@ HYPRE_Int Q_max = 0; HYPRE_Real Q_trunc = 0; - const HYPRE_Real dt_inf = 1.e40; + const HYPRE_Real dt_inf = DT_INF; HYPRE_Real dt = dt_inf; /* parameters for BoomerAMG */ @@ -213,6 +216,7 @@ HYPRE_Int additive = -1; HYPRE_Int mult_add = -1; HYPRE_Int simple = -1; + HYPRE_Int add_last_lvl = -1; HYPRE_Int add_P_max_elmts = 0; HYPRE_Real add_trunc_factor = 0; @@ -228,6 +232,9 @@ HYPRE_Real max_row_sum = 1.; HYPRE_Int cheby_order = 2; + HYPRE_Int cheby_eig_est = 10; + HYPRE_Int cheby_variant = 0; + HYPRE_Int cheby_scale = 1; HYPRE_Real cheby_fraction = .3; /* for CGC BM Aug 25, 2006 */ @@ -323,6 +330,20 @@ /* end lobpcg */ + /* mgr options */ + HYPRE_Int mgr_bsize = 1; + HYPRE_Int mgr_nlevels = 0; + HYPRE_Int mgr_num_reserved_nodes = 0; + HYPRE_Int mgr_non_c_to_f = 1; + HYPRE_Int mgr_frelax_method = 0; + HYPRE_Int *mgr_num_cindexes = NULL; + HYPRE_Int **mgr_cindexes = NULL; + HYPRE_Int *mgr_reserved_coarse_indexes = NULL; + HYPRE_Int mgr_relax_type = 0; + HYPRE_Int mgr_num_relax_sweeps = 2; + HYPRE_Int mgr_interp_type = 2; + HYPRE_Int mgr_num_interp_sweeps = 2; + /* end mgr options */ HYPRE_Real *nongalerk_tol = NULL; HYPRE_Int nongalerk_num_tol = 0; @@ -342,6 +363,8 @@ hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + hypre_GPUInit(-1); + //nvtxDomainHandle_t domain = nvtxDomainCreateA("Domain_A"); /* hypre_InitMemoryDebug(myid); */ @@ -848,6 +871,33 @@ pcgMode = atoi(argv[arg_index++]); } /* end lobpcg */ + /* begin mgr options*/ + else if ( strcmp(argv[arg_index], "-mgr_bsize") == 0 ) + { /* mgr block size */ + arg_index++; + mgr_bsize = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-mgr_nlevels") == 0 ) + { /* mgr number of coarsening levels */ + arg_index++; + mgr_nlevels = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-mgr_non_c_to_f") == 0 ) + { /* mgr intermediate coarse grid strategy */ + arg_index++; + mgr_non_c_to_f = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-mgr_num_reserved_nodes") == 0 ) + { /* mgr number of reserved nodes to be put on coarsest grid */ + arg_index++; + mgr_num_reserved_nodes = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-mgr_frelax_method") == 0 ) + { /* mgr F-relaxation strategy: single/ multi level */ + arg_index++; + mgr_frelax_method = atoi(argv[arg_index++]); + } + /* end mgr options */ else { arg_index++; @@ -886,7 +936,8 @@ /* defaults for BoomerAMG */ if (solver_id == 0 || solver_id == 1 || solver_id == 3 || solver_id == 5 || solver_id == 9 || solver_id == 13 || solver_id == 14 - || solver_id == 15 || solver_id == 20 || solver_id == 51 || solver_id == 61) + || solver_id == 15 || solver_id == 20 || solver_id == 51 || solver_id == 61 + || solver_id == 70 || solver_id == 71 || solver_id == 72) { strong_threshold = 0.25; trunc_factor = 0.; @@ -1207,6 +1258,21 @@ arg_index++; cheby_order = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-cheby_eig_est") == 0 ) + { + arg_index++; + cheby_eig_est = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-cheby_variant") == 0 ) + { + arg_index++; + cheby_variant = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-cheby_scale") == 0 ) + { + arg_index++; + cheby_scale = atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-cheby_fraction") == 0 ) { arg_index++; @@ -1227,6 +1293,11 @@ arg_index++; simple = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-add_end") == 0 ) + { + arg_index++; + add_last_lvl = atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-add_Pmx") == 0 ) { arg_index++; @@ -1291,289 +1362,313 @@ * Print usage info *-----------------------------------------------------------*/ - if ( (print_usage) && (myid == 0) ) + if ( print_usage ) { - hypre_printf("\n"); - hypre_printf("Usage: %s []\n", argv[0]); - hypre_printf("\n"); - hypre_printf(" -fromfile : "); - hypre_printf("matrix read from multiple files (IJ format)\n"); - hypre_printf(" -fromparcsrfile : "); - hypre_printf("matrix read from multiple files (ParCSR format)\n"); - hypre_printf(" -fromonecsrfile : "); - hypre_printf("matrix read from a single file (CSR format)\n"); - hypre_printf("\n"); - hypre_printf(" -laplacian [] : build 5pt 2D laplacian problem (default) \n"); - hypre_printf(" -sysL : build SYSTEMS laplacian 7pt operator\n"); - hypre_printf(" -9pt [] : build 9pt 2D laplacian problem\n"); - hypre_printf(" -27pt [] : build 27pt 3D laplacian problem\n"); - hypre_printf(" -difconv [] : build convection-diffusion problem\n"); - hypre_printf(" -n : total problem size \n"); - hypre_printf(" -P : processor topology\n"); - hypre_printf(" -c : diffusion coefficients\n"); - hypre_printf(" -a : convection coefficients\n"); - hypre_printf("\n"); - hypre_printf(" -exact_size : inserts immediately into ParCSR structure\n"); - hypre_printf(" -storage_low : allocates not enough storage for aux struct\n"); - hypre_printf(" -concrete_parcsr : use parcsr matrix type as concrete type\n"); - hypre_printf("\n"); - hypre_printf(" -rhsfromfile : "); - hypre_printf("rhs read from multiple files (IJ format)\n"); - hypre_printf(" -rhsfromonefile : "); - hypre_printf("rhs read from a single file (CSR format)\n"); - hypre_printf(" -rhsparcsrfile : "); - hypre_printf("rhs read from multiple files (ParCSR format)\n"); - hypre_printf(" -rhsrand : rhs is random vector\n"); - hypre_printf(" -rhsisone : rhs is vector with unit components (default)\n"); - hypre_printf(" -xisone : solution of all ones\n"); - hypre_printf(" -rhszero : rhs is zero vector\n"); - hypre_printf("\n"); - hypre_printf(" -dt : specify finite backward Euler time step\n"); - hypre_printf(" : -rhsfromfile, -rhsfromonefile, -rhsrand,\n"); - hypre_printf(" : -rhsrand, or -xisone will be ignored\n"); - hypre_printf(" -srcfromfile : "); - hypre_printf("backward Euler source read from multiple files (IJ format)\n"); - hypre_printf(" -srcfromonefile : "); - hypre_printf("backward Euler source read from a single file (IJ format)\n"); - hypre_printf(" -srcrand : "); - hypre_printf("backward Euler source is random vector with components in range 0 - 1\n"); - hypre_printf(" -srcisone : "); - hypre_printf("backward Euler source is vector with unit components (default)\n"); - hypre_printf(" -srczero : "); - hypre_printf("backward Euler source is zero-vector\n"); - hypre_printf("\n"); - hypre_printf(" -solver : solver ID\n"); - hypre_printf(" 0=AMG 1=AMG-PCG \n"); - hypre_printf(" 2=DS-PCG 3=AMG-GMRES \n"); - hypre_printf(" 4=DS-GMRES 5=AMG-CGNR \n"); - hypre_printf(" 6=DS-CGNR 7=PILUT-GMRES \n"); - hypre_printf(" 8=ParaSails-PCG 9=AMG-BiCGSTAB \n"); - hypre_printf(" 10=DS-BiCGSTAB 11=PILUT-BiCGSTAB \n"); - hypre_printf(" 12=Schwarz-PCG 13=GSMG \n"); - hypre_printf(" 14=GSMG-PCG 15=GSMG-GMRES\n"); - hypre_printf(" 18=ParaSails-GMRES\n"); - hypre_printf(" 20=Hybrid solver/ DiagScale, AMG \n"); - hypre_printf(" 43=Euclid-PCG 44=Euclid-GMRES \n"); - hypre_printf(" 45=Euclid-BICGSTAB\n"); - hypre_printf(" 50=DS-LGMRES 51=AMG-LGMRES \n"); - hypre_printf(" 60=DS-FlexGMRES 61=AMG-FlexGMRES \n"); - hypre_printf("\n"); - hypre_printf(" -cljp : CLJP coarsening \n"); - hypre_printf(" -cljp1 : CLJP coarsening, fixed random \n"); - hypre_printf(" -cgc : CGC coarsening \n"); - hypre_printf(" -cgce : CGC-E coarsening \n"); - hypre_printf(" -pmis : PMIS coarsening \n"); - hypre_printf(" -pmis1 : PMIS coarsening, fixed random \n"); - hypre_printf(" -hmis : HMIS coarsening (default)\n"); - hypre_printf(" -ruge : Ruge-Stueben coarsening (local)\n"); - hypre_printf(" -ruge1p : Ruge-Stueben coarsening 1st pass only(local)\n"); - hypre_printf(" -ruge3 : third pass on boundary\n"); - hypre_printf(" -ruge3c : third pass on boundary, keep c-points\n"); - hypre_printf(" -falgout : local Ruge_Stueben followed by CLJP\n"); - hypre_printf(" -gm : use global measures\n"); - hypre_printf("\n"); - hypre_printf(" -interptype : set interpolation type\n"); - hypre_printf(" 0=Classical modified interpolation \n"); - hypre_printf(" 1=least squares interpolation (for GSMG only) \n"); - hypre_printf(" 0=Classical modified interpolation for hyperbolic PDEs \n"); - hypre_printf(" 3=direct interpolation with separation of weights \n"); - hypre_printf(" 4=multipass interpolation \n"); - hypre_printf(" 5=multipass interpolation with separation of weights \n"); - hypre_printf(" 6=extended classical modified interpolation (default) \n"); - hypre_printf(" 7=extended (only if no common C neighbor) interpolation \n"); - hypre_printf(" 8=standard interpolation \n"); - hypre_printf(" 9=standard interpolation with separation of weights \n"); - hypre_printf(" 12=FF interpolation \n"); - hypre_printf(" 13=FF1 interpolation \n"); - - hypre_printf(" 16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n"); - hypre_printf(" 17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n"); - hypre_printf(" 18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n"); - hypre_printf(" 19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n"); - + if ( myid == 0 ) + { + hypre_printf("\n"); + hypre_printf("Usage: %s []\n", argv[0]); + hypre_printf("\n"); + hypre_printf(" -fromfile : "); + hypre_printf("matrix read from multiple files (IJ format)\n"); + hypre_printf(" -fromparcsrfile : "); + hypre_printf("matrix read from multiple files (ParCSR format)\n"); + hypre_printf(" -fromonecsrfile : "); + hypre_printf("matrix read from a single file (CSR format)\n"); + hypre_printf("\n"); + hypre_printf(" -laplacian [] : build 5pt 2D laplacian problem (default) \n"); + hypre_printf(" -sysL : build SYSTEMS laplacian 7pt operator\n"); + hypre_printf(" -9pt [] : build 9pt 2D laplacian problem\n"); + hypre_printf(" -27pt [] : build 27pt 3D laplacian problem\n"); + hypre_printf(" -difconv [] : build convection-diffusion problem\n"); + hypre_printf(" -n : total problem size \n"); + hypre_printf(" -P : processor topology\n"); + hypre_printf(" -c : diffusion coefficients\n"); + hypre_printf(" -a : convection coefficients\n"); + hypre_printf(" -atype : FD scheme for convection \n"); + hypre_printf(" 0=Forward (default) 1=Backward\n"); + hypre_printf(" 2=Centered 3=Upwind\n"); + hypre_printf("\n"); + hypre_printf(" -exact_size : inserts immediately into ParCSR structure\n"); + hypre_printf(" -storage_low : allocates not enough storage for aux struct\n"); + hypre_printf(" -concrete_parcsr : use parcsr matrix type as concrete type\n"); + hypre_printf("\n"); + hypre_printf(" -rhsfromfile : "); + hypre_printf("rhs read from multiple files (IJ format)\n"); + hypre_printf(" -rhsfromonefile : "); + hypre_printf("rhs read from a single file (CSR format)\n"); + hypre_printf(" -rhsparcsrfile : "); + hypre_printf("rhs read from multiple files (ParCSR format)\n"); + hypre_printf(" -rhsrand : rhs is random vector\n"); + hypre_printf(" -rhsisone : rhs is vector with unit components (default)\n"); + hypre_printf(" -xisone : solution of all ones\n"); + hypre_printf(" -rhszero : rhs is zero vector\n"); + hypre_printf("\n"); + hypre_printf(" -dt : specify finite backward Euler time step\n"); + hypre_printf(" : -rhsfromfile, -rhsfromonefile, -rhsrand,\n"); + hypre_printf(" : -rhsrand, or -xisone will be ignored\n"); + hypre_printf(" -srcfromfile : "); + hypre_printf("backward Euler source read from multiple files (IJ format)\n"); + hypre_printf(" -srcfromonefile : "); + hypre_printf("backward Euler source read from a single file (IJ format)\n"); + hypre_printf(" -srcrand : "); + hypre_printf("backward Euler source is random vector with components in range 0 - 1\n"); + hypre_printf(" -srcisone : "); + hypre_printf("backward Euler source is vector with unit components (default)\n"); + hypre_printf(" -srczero : "); + hypre_printf("backward Euler source is zero-vector\n"); + hypre_printf("\n"); + hypre_printf(" -solver : solver ID\n"); + hypre_printf(" 0=AMG 1=AMG-PCG \n"); + hypre_printf(" 2=DS-PCG 3=AMG-GMRES \n"); + hypre_printf(" 4=DS-GMRES 5=AMG-CGNR \n"); + hypre_printf(" 6=DS-CGNR 7=PILUT-GMRES \n"); + hypre_printf(" 8=ParaSails-PCG 9=AMG-BiCGSTAB \n"); + hypre_printf(" 10=DS-BiCGSTAB 11=PILUT-BiCGSTAB \n"); + hypre_printf(" 12=Schwarz-PCG 13=GSMG \n"); + hypre_printf(" 14=GSMG-PCG 15=GSMG-GMRES\n"); + hypre_printf(" 18=ParaSails-GMRES\n"); + hypre_printf(" 20=Hybrid solver/ DiagScale, AMG \n"); + hypre_printf(" 43=Euclid-PCG 44=Euclid-GMRES \n"); + hypre_printf(" 45=Euclid-BICGSTAB\n"); + hypre_printf(" 50=DS-LGMRES 51=AMG-LGMRES \n"); + hypre_printf(" 60=DS-FlexGMRES 61=AMG-FlexGMRES \n"); + hypre_printf(" 70=MGR 71=MGR-PCG 72=MGR-FlexGMRES \n"); + hypre_printf("\n"); + hypre_printf(" -cljp : CLJP coarsening \n"); + hypre_printf(" -cljp1 : CLJP coarsening, fixed random \n"); + hypre_printf(" -cgc : CGC coarsening \n"); + hypre_printf(" -cgce : CGC-E coarsening \n"); + hypre_printf(" -pmis : PMIS coarsening \n"); + hypre_printf(" -pmis1 : PMIS coarsening, fixed random \n"); + hypre_printf(" -hmis : HMIS coarsening (default)\n"); + hypre_printf(" -ruge : Ruge-Stueben coarsening (local)\n"); + hypre_printf(" -ruge1p : Ruge-Stueben coarsening 1st pass only(local)\n"); + hypre_printf(" -ruge3 : third pass on boundary\n"); + hypre_printf(" -ruge3c : third pass on boundary, keep c-points\n"); + hypre_printf(" -falgout : local Ruge_Stueben followed by CLJP\n"); + hypre_printf(" -gm : use global measures\n"); + hypre_printf("\n"); + hypre_printf(" -interptype : set interpolation type\n"); + hypre_printf(" 0=Classical modified interpolation \n"); + hypre_printf(" 1=least squares interpolation (for GSMG only) \n"); + hypre_printf(" 0=Classical modified interpolation for hyperbolic PDEs \n"); + hypre_printf(" 3=direct interpolation with separation of weights \n"); + hypre_printf(" 4=multipass interpolation \n"); + hypre_printf(" 5=multipass interpolation with separation of weights \n"); + hypre_printf(" 6=extended classical modified interpolation (default) \n"); + hypre_printf(" 7=extended (only if no common C neighbor) interpolation \n"); + hypre_printf(" 8=standard interpolation \n"); + hypre_printf(" 9=standard interpolation with separation of weights \n"); + hypre_printf(" 12=FF interpolation \n"); + hypre_printf(" 13=FF1 interpolation \n"); + + hypre_printf(" 16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n"); + hypre_printf(" 17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n"); + hypre_printf(" 18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n"); + hypre_printf(" 19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n"); + + + hypre_printf(" 10=classical block interpolation for nodal systems AMG\n"); + hypre_printf(" 11=classical block interpolation with diagonal blocks for nodal systems AMG\n"); + hypre_printf(" 20=same as 10, but don't add weak connect. to diag \n"); + hypre_printf(" 21=same as 11, but don't add weak connect. to diag \n"); + hypre_printf(" 22=classical block interpolation w/Ruge's variant for nodal systems AMG \n"); + hypre_printf(" 23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n"); + hypre_printf(" 24=direct block interpolation for nodal systems AMG\n"); - hypre_printf(" 10=classical block interpolation for nodal systems AMG\n"); - hypre_printf(" 11=classical block interpolation with diagonal blocks for nodal systems AMG\n"); - hypre_printf(" 20=same as 10, but don't add weak connect. to diag \n"); - hypre_printf(" 21=same as 11, but don't add weak connect. to diag \n"); - hypre_printf(" 22=classical block interpolation w/Ruge's variant for nodal systems AMG \n"); - hypre_printf(" 23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n"); - hypre_printf(" 24=direct block interpolation for nodal systems AMG\n"); - - hypre_printf("\n"); - hypre_printf(" -rlx : relaxation type\n"); - hypre_printf(" 0=Weighted Jacobi \n"); - hypre_printf(" 1=Gauss-Seidel (very slow!) \n"); - hypre_printf(" 3=Hybrid Gauss-Seidel \n"); - hypre_printf(" 4=Hybrid backward Gauss-Seidel \n"); - hypre_printf(" 6=Hybrid symmetric Gauss-Seidel \n"); - hypre_printf(" 8= symmetric L1-Gauss-Seidel \n"); - hypre_printf(" 13= forward L1-Gauss-Seidel \n"); - hypre_printf(" 14= backward L1-Gauss-Seidel \n"); - hypre_printf(" 15=CG \n"); - hypre_printf(" 16=Chebyshev \n"); - hypre_printf(" 17=FCF-Jacobi \n"); - hypre_printf(" 18=L1-Jacobi (may be used with -CF) \n"); - hypre_printf(" 9=Gauss elimination (use for coarsest grid only) \n"); - hypre_printf(" 99=Gauss elimination with pivoting (use for coarsest grid only) \n"); - hypre_printf(" 20= Nodal Weighted Jacobi (for systems only) \n"); - hypre_printf(" 23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n"); - hypre_printf(" 26= Nodal Hybrid Symmetric Gauss-Seidel (for systems only)\n"); - hypre_printf(" 29= Nodal Gauss elimination (use for coarsest grid only) \n"); - hypre_printf(" -rlx_coarse : set relaxation type for coarsest grid\n"); - hypre_printf(" -rlx_down : set relaxation type for down cycle\n"); - hypre_printf(" -rlx_up : set relaxation type for up cycle\n"); - hypre_printf(" -cheby_order : set order (1-4) for Chebyshev poly. smoother (default is 2)\n"); - hypre_printf(" -cheby_fraction : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n"); - hypre_printf(" -nodal : nodal system type\n"); - hypre_printf(" 0 = Unknown approach \n"); - hypre_printf(" 1 = Frobenius norm \n"); - hypre_printf(" 2 = Sum of Abs.value of elements \n"); - hypre_printf(" 3 = Largest magnitude element (includes its sign) \n"); - hypre_printf(" 4 = Inf. norm \n"); - hypre_printf(" 5 = One norm (note: use with block version only) \n"); - hypre_printf(" 6 = Sum of all elements in block \n"); - hypre_printf(" -nodal_diag :how to treat diag elements\n"); - hypre_printf(" 0 = no special treatment \n"); - hypre_printf(" 1 = make diag = neg.sum of the off_diag \n"); - hypre_printf(" 2 = make diag = neg. of diag \n"); - hypre_printf(" -ns : Use sweeps on each level\n"); - hypre_printf(" (default C/F down, F/C up, F/C fine\n"); - hypre_printf(" -ns_coarse : set no. of sweeps for coarsest grid\n"); - hypre_printf(" -ns_down : set no. of sweeps for down cycle\n"); - hypre_printf(" -ns_up : set no. of sweeps for up cycle\n"); - hypre_printf("\n"); - hypre_printf(" -mu : set AMG cycles (1=V, 2=W, etc.)\n"); - hypre_printf(" -th : set AMG threshold Theta = val \n"); - hypre_printf(" -tr : set AMG interpolation truncation factor = val \n"); - hypre_printf(" -Pmx : set maximal no. of elmts per row for AMG interpolation (default: 4)\n"); - hypre_printf(" -jtr : set truncation threshold for Jacobi interpolation = val \n"); - hypre_printf(" -Ssw : set S-commpkg-switch = val \n"); - hypre_printf(" -mxrs : set AMG maximum row sum threshold for dependency weakening \n"); - hypre_printf(" -nf : set number of functions for systems AMG\n"); - hypre_printf(" -numsamp : set number of sample vectors for GSMG\n"); - - hypre_printf(" -postinterptype : invokes no. of Jacobi interpolation steps after main interpolation\n"); - hypre_printf("\n"); - hypre_printf(" -cgcitr : set maximal number of coarsening iterations for CGC\n"); - hypre_printf(" -solver_type : sets solver within Hybrid solver\n"); - hypre_printf(" : 1 PCG (default)\n"); - hypre_printf(" : 2 GMRES\n"); - hypre_printf(" : 3 BiCGSTAB\n"); - - hypre_printf(" -w : set Jacobi relax weight = val\n"); - hypre_printf(" -k : dimension Krylov space for GMRES\n"); - hypre_printf(" -aug : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n"); - - hypre_printf(" -mxl : maximum number of levels (AMG, ParaSAILS)\n"); - hypre_printf(" -tol : set solver convergence tolerance = val\n"); - hypre_printf(" -atol : set solver absolute convergence tolerance = val\n"); - hypre_printf(" -max_iter : set max iterations\n"); - hypre_printf(" -mg_max_iter : set max iterations for mg solvers\n"); - hypre_printf(" -agg_nl : set number of aggressive coarsening levels (default:0)\n"); - hypre_printf(" -np : set number of paths of length 2 for aggr. coarsening\n"); - hypre_printf("\n"); - hypre_printf(" -sai_th : set ParaSAILS threshold = val \n"); - hypre_printf(" -sai_filt : set ParaSAILS filter = val \n"); - hypre_printf("\n"); - hypre_printf(" -level : set k in ILU(k) for Euclid \n"); - hypre_printf(" -bj : enable block Jacobi ILU for Euclid \n"); - hypre_printf(" -ilut : set drop tolerance for ILUT in Euclid\n"); - hypre_printf(" Note ILUT is sequential only!\n"); - hypre_printf(" -sparseA : set drop tolerance in ILU(k) for Euclid \n"); - hypre_printf(" -rowScale : enable row scaling in Euclid \n"); - hypre_printf("\n"); - hypre_printf(" -drop_tol : set threshold for dropping in PILUT\n"); - hypre_printf(" -nonzeros_to_keep : number of nonzeros in each row to keep\n"); - hypre_printf("\n"); - hypre_printf(" -iout : set output flag\n"); - hypre_printf(" 0=no output 1=matrix stats\n"); - hypre_printf(" 2=cycle stats 3=matrix & cycle stats\n"); - hypre_printf("\n"); - hypre_printf(" -dbg : set debug flag\n"); - hypre_printf(" 0=no debugging\n 1=internal timing\n 2=interpolation truncation\n 3=more detailed timing in coarsening routine\n"); - hypre_printf("\n"); - hypre_printf(" -print : print out the system\n"); - hypre_printf("\n"); - /* begin lobpcg */ + hypre_printf("\n"); + hypre_printf(" -rlx : relaxation type\n"); + hypre_printf(" 0=Weighted Jacobi \n"); + hypre_printf(" 1=Gauss-Seidel (very slow!) \n"); + hypre_printf(" 3=Hybrid Gauss-Seidel \n"); + hypre_printf(" 4=Hybrid backward Gauss-Seidel \n"); + hypre_printf(" 6=Hybrid symmetric Gauss-Seidel \n"); + hypre_printf(" 8= symmetric L1-Gauss-Seidel \n"); + hypre_printf(" 13= forward L1-Gauss-Seidel \n"); + hypre_printf(" 14= backward L1-Gauss-Seidel \n"); + hypre_printf(" 15=CG \n"); + hypre_printf(" 16=Chebyshev \n"); + hypre_printf(" 17=FCF-Jacobi \n"); + hypre_printf(" 18=L1-Jacobi (may be used with -CF) \n"); + hypre_printf(" 9=Gauss elimination (use for coarsest grid only) \n"); + hypre_printf(" 99=Gauss elimination with pivoting (use for coarsest grid only) \n"); + hypre_printf(" 20= Nodal Weighted Jacobi (for systems only) \n"); + hypre_printf(" 23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n"); + hypre_printf(" 26= Nodal Hybrid Symmetric Gauss-Seidel (for systems only)\n"); + hypre_printf(" 29= Nodal Gauss elimination (use for coarsest grid only) \n"); + hypre_printf(" -rlx_coarse : set relaxation type for coarsest grid\n"); + hypre_printf(" -rlx_down : set relaxation type for down cycle\n"); + hypre_printf(" -rlx_up : set relaxation type for up cycle\n"); + hypre_printf(" -cheby_order : set order (1-4) for Chebyshev poly. smoother (default is 2)\n"); + hypre_printf(" -cheby_fraction : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n"); + hypre_printf(" -nodal : nodal system type\n"); + hypre_printf(" 0 = Unknown approach \n"); + hypre_printf(" 1 = Frobenius norm \n"); + hypre_printf(" 2 = Sum of Abs.value of elements \n"); + hypre_printf(" 3 = Largest magnitude element (includes its sign) \n"); + hypre_printf(" 4 = Inf. norm \n"); + hypre_printf(" 5 = One norm (note: use with block version only) \n"); + hypre_printf(" 6 = Sum of all elements in block \n"); + hypre_printf(" -nodal_diag :how to treat diag elements\n"); + hypre_printf(" 0 = no special treatment \n"); + hypre_printf(" 1 = make diag = neg.sum of the off_diag \n"); + hypre_printf(" 2 = make diag = neg. of diag \n"); + hypre_printf(" -ns : Use sweeps on each level\n"); + hypre_printf(" (default C/F down, F/C up, F/C fine\n"); + hypre_printf(" -ns_coarse : set no. of sweeps for coarsest grid\n"); + /*hypre_printf(" -ns_down : set no. of sweeps for down cycle\n"); + hypre_printf(" -ns_up : set no. of sweeps for up cycle\n");*/ + hypre_printf("\n"); + hypre_printf(" -mu : set AMG cycles (1=V, 2=W, etc.)\n"); + hypre_printf(" -th : set AMG threshold Theta = val \n"); + hypre_printf(" -tr : set AMG interpolation truncation factor = val \n"); + hypre_printf(" -Pmx : set maximal no. of elmts per row for AMG interpolation (default: 4)\n"); + hypre_printf(" -jtr : set truncation threshold for Jacobi interpolation = val \n"); + hypre_printf(" -Ssw : set S-commpkg-switch = val \n"); + hypre_printf(" -mxrs : set AMG maximum row sum threshold for dependency weakening \n"); + hypre_printf(" -nf : set number of functions for systems AMG\n"); + hypre_printf(" -numsamp : set number of sample vectors for GSMG\n"); - hypre_printf("LOBPCG options:\n"); - hypre_printf("\n"); - hypre_printf(" -lobpcg : run LOBPCG instead of PCG\n"); - hypre_printf("\n"); - hypre_printf(" -gen : solve generalized EVP with B = Laplacian\n"); - hypre_printf("\n"); - hypre_printf(" -con : solve constrained EVP using 'vectors.*.*'\n"); - hypre_printf(" as constraints (see -vout 1 below)\n"); - hypre_printf("\n"); - hypre_printf(" -solver none : no HYPRE preconditioner is used\n"); - hypre_printf("\n"); - hypre_printf(" -itr : maximal number of LOBPCG iterations\n"); - hypre_printf(" (default 100);\n"); - hypre_printf("\n"); - hypre_printf(" -vrand : compute eigenpairs using random\n"); - hypre_printf(" initial vectors (default 1)\n"); - hypre_printf("\n"); - hypre_printf(" -seed : use as the seed for the random\n"); - hypre_printf(" number generator(default seed is based\n"); - hypre_printf(" on the time of the run)\n"); - hypre_printf("\n"); - hypre_printf(" -vfromfile : read initial vectors from files\n"); - hypre_printf(" vectors.i.j where i is vector number\n"); - hypre_printf(" and j is processor number\n"); - hypre_printf("\n"); - hypre_printf(" -orthchk : check eigenvectors for orthonormality\n"); - hypre_printf("\n"); - hypre_printf(" -verb : verbosity level\n"); - hypre_printf(" -verb 0 : no print\n"); - hypre_printf(" -verb 1 : print initial eigenvalues and residuals,\n"); - hypre_printf(" the iteration number, the number of\n"); - hypre_printf(" non-convergent eigenpairs and final\n"); - hypre_printf(" eigenvalues and residuals (default)\n"); - hypre_printf(" -verb 2 : print eigenvalues and residuals on each\n"); - hypre_printf(" iteration\n"); - hypre_printf("\n"); - hypre_printf(" -pcgitr : maximal number of inner PCG iterations\n"); - hypre_printf(" for preconditioning (default 1);\n"); - hypre_printf(" if = 0 then the preconditioner\n"); - hypre_printf(" is applied directly\n"); - hypre_printf("\n"); - hypre_printf(" -pcgtol : residual tolerance for inner iterations\n"); - hypre_printf(" (default 0.01)\n"); - hypre_printf("\n"); - hypre_printf(" -vout : file output level\n"); - hypre_printf(" -vout 0 : no files created (default)\n"); - hypre_printf(" -vout 1 : write eigenvalues to values.txt, residuals\n"); - hypre_printf(" to residuals.txt and eigenvectors to \n"); - hypre_printf(" vectors.i.j where i is vector number\n"); - hypre_printf(" and j is processor number\n"); - hypre_printf(" -vout 2 : in addition to the above, write the\n"); - hypre_printf(" eigenvalues history (the matrix whose\n"); - hypre_printf(" i-th column contains eigenvalues at\n"); - hypre_printf(" (i+1)-th iteration) to val_hist.txt and\n"); - hypre_printf(" residuals history to res_hist.txt\n"); - hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n"); - hypre_printf("\ndefault solver is 1\n"); - hypre_printf("\n"); + hypre_printf(" -postinterptype : invokes no. of Jacobi interpolation steps after main interpolation\n"); + hypre_printf("\n"); + hypre_printf(" -cgcitr : set maximal number of coarsening iterations for CGC\n"); + hypre_printf(" -solver_type : sets solver within Hybrid solver\n"); + hypre_printf(" : 1 PCG (default)\n"); + hypre_printf(" : 2 GMRES\n"); + hypre_printf(" : 3 BiCGSTAB\n"); + + hypre_printf(" -w : set Jacobi relax weight = val\n"); + hypre_printf(" -k : dimension Krylov space for GMRES\n"); + hypre_printf(" -aug : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n"); + + hypre_printf(" -mxl : maximum number of levels (AMG, ParaSAILS)\n"); + hypre_printf(" -tol : set solver convergence tolerance = val\n"); + hypre_printf(" -atol : set solver absolute convergence tolerance = val\n"); + hypre_printf(" -max_iter : set max iterations\n"); + hypre_printf(" -mg_max_iter : set max iterations for mg solvers\n"); + hypre_printf(" -agg_nl : set number of aggressive coarsening levels (default:0)\n"); + hypre_printf(" -np : set number of paths of length 2 for aggr. coarsening\n"); + hypre_printf("\n"); + hypre_printf(" -sai_th : set ParaSAILS threshold = val \n"); + hypre_printf(" -sai_filt : set ParaSAILS filter = val \n"); + hypre_printf("\n"); + hypre_printf(" -level : set k in ILU(k) for Euclid \n"); + hypre_printf(" -bj : enable block Jacobi ILU for Euclid \n"); + hypre_printf(" -ilut : set drop tolerance for ILUT in Euclid\n"); + hypre_printf(" Note ILUT is sequential only!\n"); + hypre_printf(" -sparseA : set drop tolerance in ILU(k) for Euclid \n"); + hypre_printf(" -rowScale : enable row scaling in Euclid \n"); + hypre_printf("\n"); + hypre_printf(" -drop_tol : set threshold for dropping in PILUT\n"); + hypre_printf(" -nonzeros_to_keep : number of nonzeros in each row to keep\n"); + hypre_printf("\n"); + hypre_printf(" -iout : set output flag\n"); + hypre_printf(" 0=no output 1=matrix stats\n"); + hypre_printf(" 2=cycle stats 3=matrix & cycle stats\n"); + hypre_printf("\n"); + hypre_printf(" -dbg : set debug flag\n"); + hypre_printf(" 0=no debugging\n 1=internal timing\n 2=interpolation truncation\n 3=more detailed timing in coarsening routine\n"); + hypre_printf("\n"); + hypre_printf(" -print : print out the system\n"); + hypre_printf("\n"); + /* begin lobpcg */ - /* end lobpcg */ + hypre_printf("LOBPCG options:\n"); + hypre_printf("\n"); + hypre_printf(" -lobpcg : run LOBPCG instead of PCG\n"); + hypre_printf("\n"); + hypre_printf(" -gen : solve generalized EVP with B = Laplacian\n"); + hypre_printf("\n"); + hypre_printf(" -con : solve constrained EVP using 'vectors.*.*'\n"); + hypre_printf(" as constraints (see -vout 1 below)\n"); + hypre_printf("\n"); + hypre_printf(" -solver none : no HYPRE preconditioner is used\n"); + hypre_printf("\n"); + hypre_printf(" -itr : maximal number of LOBPCG iterations\n"); + hypre_printf(" (default 100);\n"); + hypre_printf("\n"); + hypre_printf(" -vrand : compute eigenpairs using random\n"); + hypre_printf(" initial vectors (default 1)\n"); + hypre_printf("\n"); + hypre_printf(" -seed : use as the seed for the random\n"); + hypre_printf(" number generator(default seed is based\n"); + hypre_printf(" on the time of the run)\n"); + hypre_printf("\n"); + hypre_printf(" -vfromfile : read initial vectors from files\n"); + hypre_printf(" vectors.i.j where i is vector number\n"); + hypre_printf(" and j is processor number\n"); + hypre_printf("\n"); + hypre_printf(" -orthchk : check eigenvectors for orthonormality\n"); + hypre_printf("\n"); + hypre_printf(" -verb : verbosity level\n"); + hypre_printf(" -verb 0 : no print\n"); + hypre_printf(" -verb 1 : print initial eigenvalues and residuals,\n"); + hypre_printf(" the iteration number, the number of\n"); + hypre_printf(" non-convergent eigenpairs and final\n"); + hypre_printf(" eigenvalues and residuals (default)\n"); + hypre_printf(" -verb 2 : print eigenvalues and residuals on each\n"); + hypre_printf(" iteration\n"); + hypre_printf("\n"); + hypre_printf(" -pcgitr : maximal number of inner PCG iterations\n"); + hypre_printf(" for preconditioning (default 1);\n"); + hypre_printf(" if = 0 then the preconditioner\n"); + hypre_printf(" is applied directly\n"); + hypre_printf("\n"); + hypre_printf(" -pcgtol : residual tolerance for inner iterations\n"); + hypre_printf(" (default 0.01)\n"); + hypre_printf("\n"); + hypre_printf(" -vout : file output level\n"); + hypre_printf(" -vout 0 : no files created (default)\n"); + hypre_printf(" -vout 1 : write eigenvalues to values.txt, residuals\n"); + hypre_printf(" to residuals.txt and eigenvectors to \n"); + hypre_printf(" vectors.i.j where i is vector number\n"); + hypre_printf(" and j is processor number\n"); + hypre_printf(" -vout 2 : in addition to the above, write the\n"); + hypre_printf(" eigenvalues history (the matrix whose\n"); + hypre_printf(" i-th column contains eigenvalues at\n"); + hypre_printf(" (i+1)-th iteration) to val_hist.txt and\n"); + hypre_printf(" residuals history to res_hist.txt\n"); + hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n"); + hypre_printf("\ndefault solver is 1\n"); + hypre_printf("\n"); - hypre_printf(" -plot_grids : print out information for plotting the grids\n"); - hypre_printf(" -plot_file_name : file name for plotting output\n"); - hypre_printf("\n"); - hypre_printf(" -smtype :smooth type\n"); - hypre_printf(" -smlv :smooth num levels\n"); - hypre_printf(" -ov :over lap:\n"); - hypre_printf(" -dom :domain type\n"); - hypre_printf(" -use_ns : use non-symm schwarz smoother\n"); - hypre_printf(" -var : schwarz smoother variant (0-3) \n"); - hypre_printf(" -blk_sm : same as '-smtype 6 -ov 0 -dom 1 -smlv '\n"); - hypre_printf(" -nongalerk_tol : specify the NonGalerkin drop tolerance\n"); - hypre_printf(" and list contains the values, where last value\n"); - hypre_printf(" in list is repeated if val < num_levels in AMG\n"); - exit(1); + /* end lobpcg */ + + hypre_printf(" -plot_grids : print out information for plotting the grids\n"); + hypre_printf(" -plot_file_name : file name for plotting output\n"); + hypre_printf("\n"); + hypre_printf(" -smtype :smooth type\n"); + hypre_printf(" -smlv :smooth num levels\n"); + hypre_printf(" -ov :over lap:\n"); + hypre_printf(" -dom :domain type\n"); + hypre_printf(" -use_ns : use non-symm schwarz smoother\n"); + hypre_printf(" -var : schwarz smoother variant (0-3) \n"); + hypre_printf(" -blk_sm : same as '-smtype 6 -ov 0 -dom 1 -smlv '\n"); + hypre_printf(" -nongalerk_tol : specify the NonGalerkin drop tolerance\n"); + hypre_printf(" and list contains the values, where last value\n"); + hypre_printf(" in list is repeated if val < num_levels in AMG\n"); + + /* MGR options */ + hypre_printf(" -mgr_bsize : set block size = val\n"); + hypre_printf(" -mgr_nlevels : set number of coarsening levels = val\n"); + hypre_printf(" -mgr_num_reserved_nodes : set number of reserved nodes \n"); + hypre_printf(" to be kept till the coarsest grid = val\n"); + hypre_printf(" -mgr_non_c_to_f : set strategy for intermediate coarse grid \n"); + hypre_printf(" -mgr_non_c_to_f 0 : Allow some non Cpoints to be labeled \n"); + hypre_printf(" Cpoints on intermediate grid \n"); + hypre_printf(" -mgr_non_c_to_f 1 : set non Cpoints strictly to Fpoints \n"); + hypre_printf(" -mgr_frelax_method : set F-relaxation strategy \n"); + hypre_printf(" -mgr_frelax_method 0 : Use 'single-level smoother' strategy \n"); + hypre_printf(" for F-relaxation \n"); + hypre_printf(" -mgr_frelax_method 1 : Use a 'multi-level smoother' strategy \n"); + hypre_printf(" for F-relaxation \n"); + /* end MGR options */ + } + + goto final; } /*----------------------------------------------------------- @@ -1636,7 +1731,8 @@ else if ( build_matrix_type == 6 ) { BuildParVarDifConv(argc, argv, build_matrix_arg_index, &parcsr_A, &b); - /*HYPRE_ParCSRMatrixPrint(parcsr_A,"mat100");*/ + build_rhs_type = 6; + build_src_type = 5; } else if ( build_matrix_type == 7 ) { @@ -2163,6 +2259,10 @@ ierr = HYPRE_IJVectorGetObject( ij_x, &object ); x = (HYPRE_ParVector) object; } + else if ( build_rhs_type == 6) + { + ij_b = NULL; + } if ( build_src_type == 0 ) { @@ -2328,6 +2428,33 @@ ierr = HYPRE_IJVectorGetObject( ij_x, &object ); x = (HYPRE_ParVector) object; } + else if ( build_src_type == 5 ) + { + if (myid == 0) + { + hypre_printf(" Initial guess is random \n"); + } + + /* Initial guess */ + HYPRE_IJVectorCreate(hypre_MPI_COMM_WORLD, first_local_col, last_local_col, &ij_x); + HYPRE_IJVectorSetObjectType(ij_x, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(ij_x); + + /* For backward Euler the previous backward Euler iterate (assumed + random in 0 - 1 here) is usually used as the initial guess */ + values = hypre_CTAlloc(HYPRE_Real, local_num_cols); + /* hypre_SeedRand(myid+2747); */ + hypre_SeedRand(myid); + for (i = 0; i < local_num_cols; i++) + { + values[i] = hypre_Rand(); + } + HYPRE_IJVectorSetValues(ij_x, local_num_cols, NULL, values); + hypre_TFree(values); + + ierr = HYPRE_IJVectorGetObject( ij_x, &object ); + x = (HYPRE_ParVector) object; + } hypre_EndTiming(time_index); hypre_PrintTiming("IJ Vector Setup", hypre_MPI_COMM_WORLD); @@ -2359,7 +2486,14 @@ if (print_system) { HYPRE_IJMatrixPrint(ij_A, "IJ.out.A"); - HYPRE_IJVectorPrint(ij_b, "IJ.out.b"); + if (ij_b) + { + HYPRE_IJVectorPrint(ij_b, "IJ.out.b"); + } + else if (b) + { + HYPRE_ParVectorPrint(b, "ParVec.out.b"); + } HYPRE_IJVectorPrint(ij_x, "IJ.out.x0"); /* HYPRE_ParCSRMatrixPrint( parcsr_A, "new_mat.A" );*/ @@ -2497,6 +2631,9 @@ HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt); HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order); HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale); HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order); HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt); HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt); @@ -2536,6 +2673,7 @@ HYPRE_BoomerAMGSetAdditive(amg_solver, additive); HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add); HYPRE_BoomerAMGSetSimple(amg_solver, simple); + HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor); @@ -2653,6 +2791,9 @@ HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt); HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order); HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale); HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order); HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt); HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt); @@ -2690,6 +2831,7 @@ HYPRE_BoomerAMGSetAdditive(amg_solver, additive); HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add); HYPRE_BoomerAMGSetSimple(amg_solver, simple); + HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor); HYPRE_BoomerAMGSetRAP2(amg_solver, rap2); @@ -2760,7 +2902,7 @@ /* begin lobpcg */ if (!lobpcgFlag && (solver_id == 1 || solver_id == 2 || solver_id == 8 || - solver_id == 12 || solver_id == 14 || solver_id == 43)) + solver_id == 12 || solver_id == 14 || solver_id == 43 || solver_id == 71)) /*end lobpcg */ { time_index = hypre_InitializeTiming("PCG Setup"); @@ -2817,6 +2959,9 @@ HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt); HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order); HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale); HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order); HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt); HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt); @@ -2855,6 +3000,7 @@ HYPRE_BoomerAMGSetAdditive(pcg_precond, additive); HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add); HYPRE_BoomerAMGSetSimple(pcg_precond, simple); + HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor); HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2); @@ -2972,6 +3118,9 @@ HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order); HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order); HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale); HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt); HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt); if (level_w > -1) @@ -3008,6 +3157,7 @@ HYPRE_BoomerAMGSetAdditive(pcg_precond, additive); HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add); HYPRE_BoomerAMGSetSimple(pcg_precond, simple); + HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor); HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2); @@ -3051,6 +3201,91 @@ (HYPRE_PtrToSolverFcn) HYPRE_EuclidSetup, pcg_precond); } + else if( solver_id == 71 ) + { + /* use MGR preconditioning */ + if (myid == 0) hypre_printf("Solver: MGR-PCG\n"); + + HYPRE_MGRCreate(&pcg_precond); + + mgr_num_cindexes = hypre_CTAlloc(HYPRE_Int, mgr_nlevels); + for(i=0; i -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1); + if (relax_up > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2); + if (relax_coarse > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3); + HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1); + HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels); + HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type); + HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps); + HYPRE_BoomerAMGSetMaxIter(amg_solver, 1); + HYPRE_BoomerAMGSetTol(amg_solver, 0.0); + + /* set the MGR coarse solver. Comment out to use default CG solver in MGR */ + HYPRE_MGRSetCoarseSolver( pcg_precond, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver); + + /* setup MGR-PCG solver */ + HYPRE_PCGSetPrecond(pcg_solver, + (HYPRE_PtrToSolverFcn) HYPRE_MGRSolve, + (HYPRE_PtrToSolverFcn) HYPRE_MGRSetup, + pcg_precond); + + } HYPRE_PCGGetPrecond(pcg_solver, &pcg_precond_gotten); if (pcg_precond_gotten != pcg_precond) @@ -3064,7 +3299,7 @@ HYPRE_PCGSetup(pcg_solver, (HYPRE_Matrix)parcsr_A, (HYPRE_Vector)b, (HYPRE_Vector)x); - + hypre_EndTiming(time_index); hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); @@ -3115,6 +3350,31 @@ { HYPRE_EuclidDestroy(pcg_precond); } + else if(solver_id == 71) + { + /* free memory */ + if(mgr_num_cindexes) + hypre_TFree(mgr_num_cindexes); + mgr_num_cindexes = NULL; + + if(mgr_reserved_coarse_indexes) + hypre_TFree(mgr_reserved_coarse_indexes); + mgr_reserved_coarse_indexes = NULL; + + if(mgr_cindexes) + { + for( i=0; i -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1); + if (relax_up > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2); + if (relax_coarse > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3); + HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1); + HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels); + HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type); + HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps); + HYPRE_BoomerAMGSetMaxIter(amg_solver, 1); + HYPRE_BoomerAMGSetTol(amg_solver, 0.0); + + /* set the MGR coarse solver. Comment out to use default CG solver in MGR */ + HYPRE_MGRSetCoarseSolver( pcg_precond, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver); + + /* setup MGR-PCG solver */ + HYPRE_FlexGMRESSetMaxIter(pcg_solver, mg_max_iter); + HYPRE_FlexGMRESSetPrecond(pcg_solver, + (HYPRE_PtrToSolverFcn) HYPRE_MGRSolve, + (HYPRE_PtrToSolverFcn) HYPRE_MGRSetup, + pcg_precond); + } else if (solver_id == 60) { /* use diagonal scaling as preconditioner */ @@ -4774,6 +5135,9 @@ HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt); HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order); HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale); HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order); HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt); HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt); @@ -4813,6 +5177,7 @@ HYPRE_BoomerAMGSetAdditive(pcg_precond, additive); HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add); HYPRE_BoomerAMGSetSimple(pcg_precond, simple); + HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor); HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2); @@ -4996,6 +5361,9 @@ HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt); HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order); HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction); + HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est); + HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant); + HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale); HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order); HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt); HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt); @@ -5027,6 +5395,7 @@ HYPRE_BoomerAMGSetAdditive(pcg_precond, additive); HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add); HYPRE_BoomerAMGSetSimple(pcg_precond, simple); + HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl); HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts); HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor); HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2); @@ -5113,12 +5482,164 @@ } /*----------------------------------------------------------- + * Solve the system using MGR + *-----------------------------------------------------------*/ + + if (solver_id == 70) + { + if (myid == 0) hypre_printf("Solver: MGR\n"); + time_index = hypre_InitializeTiming("MGR Setup"); + hypre_BeginTiming(time_index); + + HYPRE_Solver mgr_solver; + HYPRE_MGRCreate(&mgr_solver); + + mgr_num_cindexes = hypre_CTAlloc(HYPRE_Int, mgr_nlevels); + for(i=0; i -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1); + if (relax_up > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2); + if (relax_coarse > -1) + HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3); + HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1); + HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels); + HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type); + HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps); + if(mgr_nlevels < 1 || mgr_bsize < 2) + { + HYPRE_BoomerAMGSetMaxIter(amg_solver, max_iter); + HYPRE_BoomerAMGSetPrintLevel(amg_solver, 3); + } + else + { + HYPRE_BoomerAMGSetMaxIter(amg_solver, 1); + HYPRE_BoomerAMGSetTol(amg_solver, 0.0); + HYPRE_BoomerAMGSetPrintLevel(amg_solver, 1); + } + /* set the MGR coarse solver. Comment out to use default CG solver in MGR */ + HYPRE_MGRSetCoarseSolver( mgr_solver, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver); + + /* setup MGR solver */ + HYPRE_MGRSetup(mgr_solver, parcsr_A, b, x); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + time_index = hypre_InitializeTiming("MGR Solve"); + hypre_BeginTiming(time_index); + + /* MGR solve */ + HYPRE_MGRSolve(mgr_solver, parcsr_A, b, x); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", hypre_MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + HYPRE_MGRGetNumIterations(mgr_solver, &num_iterations); + HYPRE_MGRGetFinalRelativeResidualNorm(mgr_solver, &final_res_norm); + + if (myid == 0) + { + hypre_printf("\n"); + hypre_printf("MGR Iterations = %d\n", num_iterations); + hypre_printf("Final Relative Residual Norm = %e\n", final_res_norm); + hypre_printf("\n"); + } + +#if SECOND_TIME + /* run a second time to check for memory leaks */ + HYPRE_ParVectorSetRandomValues(x, 775); + HYPRE_MGRSetup(mgr_solver, parcsr_A, b, x); + HYPRE_MGRSolve(mgr_solver, parcsr_A, b, x); +#endif + + /* free memory */ + if(mgr_num_cindexes) + hypre_TFree(mgr_num_cindexes); + mgr_num_cindexes = NULL; + + if(mgr_reserved_coarse_indexes) + hypre_TFree(mgr_reserved_coarse_indexes); + mgr_reserved_coarse_indexes = NULL; + + if(mgr_cindexes) + { + for( i=0; i a) ); +} + +/*---------------------------------------------------------------------- * Build standard 7-point convection-diffusion operator * Parameters given in command line. * Operator: @@ -5774,18 +6308,19 @@ BuildParDifConv( HYPRE_Int argc, char *argv[], HYPRE_Int arg_index, - HYPRE_ParCSRMatrix *A_ptr ) + HYPRE_ParCSRMatrix *A_ptr) { - HYPRE_Int nx, ny, nz; - HYPRE_Int P, Q, R; + HYPRE_Int nx, ny, nz; + HYPRE_Int P, Q, R; HYPRE_Real cx, cy, cz; - HYPRE_Real ax, ay, az; + HYPRE_Real ax, ay, az, atype; HYPRE_Real hinx,hiny,hinz; + HYPRE_Int sign_prod; HYPRE_ParCSRMatrix A; - HYPRE_Int num_procs, myid; - HYPRE_Int p, q, r; + HYPRE_Int num_procs, myid; + HYPRE_Int p, q, r; HYPRE_Real *values; /*----------------------------------------------------------- @@ -5815,6 +6350,8 @@ ay = 1.; az = 1.; + atype = 0; + /*----------------------------------------------------------- * Parse command line *-----------------------------------------------------------*/ @@ -5849,6 +6386,11 @@ ay = atof(argv[arg_index++]); az = atof(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-atype") == 0 ) + { + arg_index++; + atype = atoi(argv[arg_index++]); + } else { arg_index++; @@ -5895,28 +6437,146 @@ /*----------------------------------------------------------- * Generate the matrix *-----------------------------------------------------------*/ - + /* values[7]: + * [0]: center + * [1]: X- + * [2]: Y- + * [3]: Z- + * [4]: X+ + * [5]: Y+ + * [6]: Z+ + */ values = hypre_CTAlloc(HYPRE_Real, 7); - values[1] = -cx/(hinx*hinx); - values[2] = -cy/(hiny*hiny); - values[3] = -cz/(hinz*hinz); - values[4] = -cx/(hinx*hinx) + ax/hinx; - values[5] = -cy/(hiny*hiny) + ay/hiny; - values[6] = -cz/(hinz*hinz) + az/hinz; - values[0] = 0.; - if (nx > 1) + + if (0 == atype) /* forward scheme for conv */ { - values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx; + values[1] = -cx/(hinx*hinx); + values[2] = -cy/(hiny*hiny); + values[3] = -cz/(hinz*hinz); + values[4] = -cx/(hinx*hinx) + ax/hinx; + values[5] = -cy/(hiny*hiny) + ay/hiny; + values[6] = -cz/(hinz*hinz) + az/hinz; + + if (nx > 1) + { + values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx; + } + if (ny > 1) + { + values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny; + } + if (nz > 1) + { + values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz; + } + } + else if (1 == atype) /* backward scheme for conv */ + { + values[1] = -cx/(hinx*hinx) - ax/hinx; + values[2] = -cy/(hiny*hiny) - ay/hiny; + values[3] = -cz/(hinz*hinz) - az/hinz; + values[4] = -cx/(hinx*hinx); + values[5] = -cy/(hiny*hiny); + values[6] = -cz/(hinz*hinz); + + if (nx > 1) + { + values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx; + } + if (ny > 1) + { + values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny; + } + if (nz > 1) + { + values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz; + } } - if (ny > 1) + else if (3 == atype) /* upwind scheme */ { - values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny; + sign_prod = sign_double(cx) * sign_double(ax); + if (sign_prod == 1) /* same sign use back scheme */ + { + values[1] = -cx/(hinx*hinx) - ax/hinx; + values[4] = -cx/(hinx*hinx); + if (nx > 1) + { + values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx; + } + } + else /* diff sign use forward scheme */ + { + values[1] = -cx/(hinx*hinx); + values[4] = -cx/(hinx*hinx) + ax/hinx; + if (nx > 1) + { + values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx; + } + } + + sign_prod = sign_double(cy) * sign_double(ay); + if (sign_prod == 1) /* same sign use back scheme */ + { + values[2] = -cy/(hiny*hiny) - ay/hiny; + values[5] = -cy/(hiny*hiny); + if (ny > 1) + { + values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny; + } + } + else /* diff sign use forward scheme */ + { + values[2] = -cy/(hiny*hiny); + values[5] = -cy/(hiny*hiny) + ay/hiny; + if (ny > 1) + { + values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny; + } + } + + sign_prod = sign_double(cz) * sign_double(az); + if (sign_prod == 1) /* same sign use back scheme */ + { + values[3] = -cz/(hinz*hinz) - az/hinz; + values[6] = -cz/(hinz*hinz); + if (nz > 1) + { + values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz; + } + } + else /* diff sign use forward scheme */ + { + values[3] = -cz/(hinz*hinz); + values[6] = -cz/(hinz*hinz) + az/hinz; + if (nz > 1) + { + values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz; + } + } } - if (nz > 1) + else /* centered difference scheme */ { - values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz; + values[1] = -cx/(hinx*hinx) - ax/(2.*hinx); + values[2] = -cy/(hiny*hiny) - ay/(2.*hiny); + values[3] = -cz/(hinz*hinz) - az/(2.*hinz); + values[4] = -cx/(hinx*hinx) + ax/(2.*hinx); + values[5] = -cy/(hiny*hiny) + ay/(2.*hiny); + values[6] = -cz/(hinz*hinz) + az/(2.*hinz); + + if (nx > 1) + { + values[0] += 2.0*cx/(hinx*hinx); + } + if (ny > 1) + { + values[0] += 2.0*cy/(hiny*hiny); + } + if (nz > 1) + { + values[0] += 2.0*cz/(hinz*hinz); + } } A = (HYPRE_ParCSRMatrix) GenerateDifConv(hypre_MPI_COMM_WORLD, @@ -6580,8 +7240,9 @@ HYPRE_ParCSRMatrix A; HYPRE_ParVector rhs; - HYPRE_Int num_procs, myid; - HYPRE_Int p, q, r; + HYPRE_Int num_procs, myid; + HYPRE_Int p, q, r; + HYPRE_Int type; HYPRE_Real eps; /*----------------------------------------------------------- @@ -6601,6 +7262,11 @@ P = 1; Q = num_procs; R = 1; + eps = 1.0; + + /* type: 0 : default FD; + * 1-3 : FD and examples 1-3 in Ruge-Stuben paper */ + type = 0; /*----------------------------------------------------------- * Parse command line @@ -6627,6 +7293,11 @@ arg_index++; eps = atof(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-vardifconvRS") == 0 ) + { + arg_index++; + type = atoi(argv[arg_index++]); + } else { arg_index++; @@ -6667,8 +7338,17 @@ * Generate the matrix *-----------------------------------------------------------*/ - A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD, - nx, ny, nz, P, Q, R, p, q, r, eps, &rhs); + if (0 == type) + { + A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD, + nx, ny, nz, P, Q, R, p, q, r, eps, &rhs); + } + else + { + A = (HYPRE_ParCSRMatrix) GenerateRSVarDifConv(hypre_MPI_COMM_WORLD, + nx, ny, nz, P, Q, R, p, q, r, eps, &rhs, + type); + } *A_ptr = A; *rhs_ptr = rhs; diff -Nru hypre-2.11.2/src/test/Makefile hypre-2.13.0/src/test/Makefile --- hypre-2.11.2/src/test/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -10,10 +10,12 @@ # $Revision$ #EHEADER********************************************************************** +default:all include ../config/Makefile.config CINCLUDES = ${INCLUDES} ${MPIINCLUDE} + CDEFS = -DHYPRE_TIMING -DHYPRE_FORTRAN CXXDEFS = -DNOFEI -DHYPRE_TIMING -DMPICH_SKIP_MPICXX @@ -162,6 +164,10 @@ @echo "Building" $@ "... " ${LINK_CC} -o $@ $@.o ${LFLAGS} +struct_newboxloop: struct_newboxloop.o $(KOKKOS_LINK_DEPENDS) + @echo "Building" $@ "... " + ${LINK_CC} -o $@ $@.o ${LFLAGS} + # RDF: Keep these for now hypre_set_precond: hypre_set_precond.o diff -Nru hypre-2.11.2/src/test/maxwell_unscaled.c hypre-2.13.0/src/test/maxwell_unscaled.c --- hypre-2.11.2/src/test/maxwell_unscaled.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/maxwell_unscaled.c 2017-10-20 17:42:22.000000000 +0000 @@ -1336,6 +1336,8 @@ hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid); + hypre_GPUInit(-1); + hypre_InitMemoryDebug(myid); /*----------------------------------------------------------- @@ -1741,7 +1743,7 @@ for (j = 0; j < data.max_boxsize; j++) { values[j]= sin((HYPRE_Real)(j+1)); - values[j]= (HYPRE_Real) rand()/RAND_MAX; + values[j]= (HYPRE_Real) hypre_Rand(); values[j]= (HYPRE_Real) j; } for (part = 0; part < data.nparts; part++) @@ -1900,6 +1902,7 @@ hypre_FinalizeMemoryDebug(); /* Finalize MPI */ + hypre_GPUFinalize(); hypre_MPI_Finalize(); return (0); diff -Nru hypre-2.11.2/src/test/runtest.sh hypre-2.13.0/src/test/runtest.sh --- hypre-2.11.2/src/test/runtest.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/runtest.sh 2017-10-20 17:42:22.000000000 +0000 @@ -82,7 +82,7 @@ # RunString="${RunString} -nodes $POE_NUM_NODES $MY_ARGS" RunString="poe $MY_ARGS -rmpool pdebug -procs $POE_NUM_PROCS -nodes $POE_NUM_NODES" ;; - rzzeus*|rzmerl*|ansel*|aztec*|cab*|sierra*|vulcan*) + rztopaz*|aztec*|cab*|quartz*|sierra*|syrah*|vulcan*) shift if [ $NumThreads -gt 0 ] ; then export OMP_NUM_THREADS=$NumThreads diff -Nru hypre-2.11.2/src/test/sstruct.c hypre-2.13.0/src/test/sstruct.c --- hypre-2.11.2/src/test/sstruct.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/sstruct.c 2017-10-20 17:42:22.000000000 +0000 @@ -466,7 +466,6 @@ sdata_line = fgets((sdata + sdata_size), maxline, file); } } - /* broadcast the data size */ hypre_MPI_Bcast(&sdata_size, 1, HYPRE_MPI_INT, 0, hypre_MPI_COMM_WORLD); @@ -2419,10 +2418,15 @@ /* Initialize MPI */ hypre_MPI_Init(&argc, &argv); +#if defined(HYPRE_USE_KOKKOS) + Kokkos::InitArguments args; + args.num_threads = 10; + Kokkos::initialize (args); +#endif hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid); - + hypre_GPUInit(-1); hypre_InitMemoryDebug(myid); /*----------------------------------------------------------- @@ -5734,6 +5738,10 @@ hypre_FinalizeMemoryDebug(); /* Finalize MPI */ + hypre_GPUFinalize(); +#if defined(HYPRE_USE_KOKKOS) + Kokkos::finalize (); +#endif hypre_MPI_Finalize(); return (0); diff -Nru hypre-2.11.2/src/test/struct.c hypre-2.13.0/src/test/struct.c --- hypre-2.11.2/src/test/struct.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/struct.c 2017-10-20 17:42:22.000000000 +0000 @@ -188,7 +188,12 @@ /* Initialize MPI */ hypre_MPI_Init(&argc, &argv); - +#if defined(HYPRE_USE_KOKKOS) + Kokkos::InitArguments args; + args.num_threads = 10; + Kokkos::initialize (args); +#endif + hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); @@ -839,7 +844,7 @@ * Set up the stencil structure needed for matrix creation * which is always the case for read_fromfile_param == 0 *-----------------------------------------------------------*/ - + HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil); for (s = 0; s < (2-sym)*dim + 1; s++) { @@ -884,7 +889,6 @@ (read_rhsfromfile_param ==1) ) { - hypre_printf("\nreading linear system from files: matrix, rhs and x0\n"); /* ghost selection for reading the matrix and vectors */ for (i = 0; i < dim; i++) { @@ -970,12 +974,14 @@ HYPRE_StructGridSetPeriodic(grid, periodic); HYPRE_StructGridSetNumGhost(grid, num_ghost); HYPRE_StructGridAssemble(grid); - + + /*----------------------------------------------------------- * Set up the matrix structure *-----------------------------------------------------------*/ - + HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A); + if ( solver_id == 3 || solver_id == 4 || solver_id == 13 || solver_id == 14 ) { @@ -1008,13 +1014,14 @@ constant_coefficient = 2; } } + HYPRE_StructMatrixSetSymmetric(A, sym); HYPRE_StructMatrixInitialize(A); /*----------------------------------------------------------- * Fill in the matrix elements *-----------------------------------------------------------*/ - + AddValuesMatrix(A,grid,cx,cy,cz,conx,cony,conz); /* Zero out stencils reaching to real boundary */ @@ -1022,7 +1029,6 @@ if ( constant_coefficient == 0 ) SetStencilBndry(A,grid,periodic); HYPRE_StructMatrixAssemble(A); - /*----------------------------------------------------------- * Set up the linear system *-----------------------------------------------------------*/ @@ -1041,7 +1047,7 @@ HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x); HYPRE_StructVectorInitialize(x); - + AddValuesVector(grid,x,periodx0,0.0); HYPRE_StructVectorAssemble(x); @@ -2765,6 +2771,9 @@ } /* Finalize MPI */ +#if defined(HYPRE_USE_KOKKOS) + Kokkos::finalize (); +#endif hypre_MPI_Finalize(); return (0); @@ -2799,8 +2808,8 @@ { box = hypre_BoxArrayBox(gridboxes, ib); volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, volume); - + values = hypre_UMCTAlloc(HYPRE_Real, volume); + /*----------------------------------------------------------- * For periodic b.c. in all directions, need rhs to satisfy * compatibility condition. Achieved by setting a source and @@ -2827,8 +2836,10 @@ ilower = hypre_BoxIMin(box); iupper = hypre_BoxIMax(box); + HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); - hypre_TFree(values); + hypre_UMTFree(values); + } @@ -2900,8 +2911,8 @@ { box = hypre_BoxArrayBox(gridboxes, bi); volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, stencil_size*volume); - + values = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume); + for (i = 0; i < stencil_size*volume; i += stencil_size) { switch (dim) @@ -2925,14 +2936,16 @@ } ilower = hypre_BoxIMin(box); iupper = hypre_BoxIMax(box); + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size, stencil_indices, values); - hypre_TFree(values); + hypre_UMTFree(values); + } } else if ( constant_coefficient==1 ) { - values = hypre_CTAlloc(HYPRE_Real, stencil_size); + values = hypre_CTAlloc(HYPRE_Real, stencil_size); switch (dim) { case 1: @@ -2963,7 +2976,7 @@ hypre_assert( constant_coefficient==2 ); /* stencil index for the center equals dim, so it's easy to leave out */ - values = hypre_CTAlloc(HYPRE_Real, stencil_size-1); + values = hypre_UMCTAlloc(HYPRE_Real, stencil_size-1); switch (dim) { case 1: @@ -2984,14 +2997,13 @@ HYPRE_StructMatrixSetConstantValues(A, stencil_size-1, stencil_indices, values); } - hypre_TFree(values); + hypre_UMTFree(values); hypre_ForBoxI(bi, gridboxes) { box = hypre_BoxArrayBox(gridboxes, bi); volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, volume); - + values = hypre_UMCTAlloc(HYPRE_Real, volume); for ( i=0; i < volume; ++i ) { values[i] = center; @@ -3000,7 +3012,7 @@ iupper = hypre_BoxIMax(box); HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, stencil_indices+dim, values); - hypre_TFree(values); + hypre_UMTFree(values); } } } @@ -3043,7 +3055,7 @@ { box = hypre_BoxArrayBox(gridboxes, bi); volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, stencil_size*volume); + values = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume); for (i = 0; i < stencil_size*volume; i += stencil_size) { @@ -3077,7 +3089,7 @@ HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size, stencil_indices, values); - hypre_TFree(values); + hypre_UMTFree(values); } } else if ( constant_coefficient==1 ) @@ -3120,7 +3132,7 @@ else { hypre_assert( constant_coefficient==2 ); - values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 ); + values = hypre_UMCTAlloc( HYPRE_Real, stencil_size-1 ); switch (dim) { /* no center in stencil_indices and values */ case 1: @@ -3160,7 +3172,7 @@ HYPRE_StructMatrixSetConstantValues(A, stencil_size, stencil_indices, values); } - hypre_TFree(values); + hypre_UMTFree(values); /* center is variable */ @@ -3169,7 +3181,7 @@ { box = hypre_BoxArrayBox(gridboxes, bi); volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, volume); + values = hypre_UMCTAlloc(HYPRE_Real, volume); for ( i=0; i < volume; ++i ) { @@ -3179,7 +3191,7 @@ iupper = hypre_BoxIMax(box); HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, stencil_indices, values); - hypre_TFree(values); + hypre_UMTFree(values); } } } @@ -3260,7 +3272,7 @@ { for (ib = 0; ib < size; ib++) { - values = hypre_CTAlloc(HYPRE_Real, vol[ib]); + values = hypre_UMCTAlloc(HYPRE_Real, vol[ib]); for (i = 0; i < vol[ib]; i++) { @@ -3286,7 +3298,7 @@ 1, stencil_indices, values); ilower[ib][d] = j; } - hypre_TFree(values); + hypre_UMTFree(values); } } } diff -Nru hypre-2.11.2/src/test/struct_migrate.c hypre-2.13.0/src/test/struct_migrate.c --- hypre-2.11.2/src/test/struct_migrate.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/struct_migrate.c 2017-10-20 17:42:22.000000000 +0000 @@ -65,7 +65,11 @@ /* Initialize MPI */ hypre_MPI_Init(&argc, &argv); - +#if defined(HYPRE_USE_KOKKOS) + Kokkos::InitArguments args; + args.num_threads = 10; + Kokkos::initialize (args); +#endif hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); @@ -362,7 +366,7 @@ if (myid == 0) { - printf("\nCheck = %1.0f (success = 0)\n\n", check); + hypre_printf("\nCheck = %1.0f (success = 0)\n\n", check); } /*----------------------------------------------------------- @@ -397,6 +401,9 @@ HYPRE_StructVectorDestroy(check_vector); /* Finalize MPI */ +#if defined(HYPRE_USE_KOKKOS) + Kokkos::finalize (); +#endif hypre_MPI_Finalize(); return (0); @@ -413,7 +420,7 @@ { HYPRE_Int ierr = 0; hypre_BoxArray *gridboxes; - HYPRE_Int i,ib; + HYPRE_Int ib; hypre_IndexRef ilower; hypre_IndexRef iupper; hypre_Box *box; @@ -426,18 +433,19 @@ hypre_ForBoxI(ib, gridboxes) { box = hypre_BoxArrayBox(gridboxes, ib); - volume = hypre_BoxVolume(box); - values = hypre_CTAlloc(HYPRE_Real, volume); + volume = hypre_BoxVolume(box); + values = hypre_DeviceCTAlloc(HYPRE_Real, volume); - for (i = 0; i < volume; i++) + hypre_LoopBegin(volume,i) { values[i] = value; } - + hypre_LoopEnd(); + ilower = hypre_BoxIMin(box); iupper = hypre_BoxIMax(box); HYPRE_StructVectorSetBoxValues(vector, ilower, iupper, values); - hypre_TFree(values); + hypre_DeviceTFree(values); } return ierr; diff -Nru hypre-2.11.2/src/test/struct_newboxloop.c hypre-2.13.0/src/test/struct_newboxloop.c --- hypre-2.11.2/src/test/struct_newboxloop.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/struct_newboxloop.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,1956 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#include +#include +#include +#include +#include + +#include "_hypre_utilities.h" +#include "HYPRE_struct_ls.h" +#include "HYPRE_krylov.h" + +#if defined( KOKKOS_HAVE_MPI ) +#include +#endif + +#define HYPRE_MFLOPS 0 +#if HYPRE_MFLOPS +#include "_hypre_struct_mv.h" +#endif + +/* RDF: Why is this include here? */ +#include "_hypre_struct_mv.h" + +#ifdef HYPRE_DEBUG +#include +#endif + +/* begin lobpcg */ + +#define NO_SOLVER -9198 + +#include + +#include "fortran_matrix.h" +#include "HYPRE_lobpcg.h" +#include "interpreter.h" +#include "multivector.h" +#include "HYPRE_MatvecFunctions.h" + +/* end lobpcg */ + +HYPRE_Int SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period); + +HYPRE_Int AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix, + HYPRE_Real cx, + HYPRE_Real cy, + HYPRE_Real cz, + HYPRE_Real conx, + HYPRE_Real cony, + HYPRE_Real conz) ; + +HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) ; + +/*-------------------------------------------------------------------------- + * Test driver for structured matrix interface (structured storage) + *--------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------- + * Standard 7-point laplacian in 3D with grid and anisotropy determined + * as command line arguments. Do `driver -help' for usage info. + *----------------------------------------------------------------------*/ + +hypre_int +main( hypre_int argc, + char *argv[] ) +{ + HYPRE_Int arg_index; + HYPRE_Int print_usage; + HYPRE_Int nx, ny, nz; + HYPRE_Int P, Q, R; + HYPRE_Int bx, by, bz; + HYPRE_Int px, py, pz; + HYPRE_Real cx, cy, cz; + HYPRE_Real conx, cony, conz; + HYPRE_Int solver_id; + HYPRE_Int solver_type; + + /*HYPRE_Real dxyz[3];*/ + + HYPRE_Int A_num_ghost[6] = {0, 0, 0, 0, 0, 0}; + HYPRE_Int v_num_ghost[3] = {0,0,0}; + + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + HYPRE_Int num_iterations; + HYPRE_Int time_index; + HYPRE_Real final_res_norm; + HYPRE_Real cf_tol; + + HYPRE_Int num_procs, myid; + + HYPRE_Int p, q, r; + HYPRE_Int dim; + HYPRE_Int n_pre, n_post; + HYPRE_Int nblocks ; + HYPRE_Int skip; + HYPRE_Int sym; + HYPRE_Int rap; + HYPRE_Int relax; + HYPRE_Real jacobi_weight; + HYPRE_Int usr_jacobi_weight; + HYPRE_Int jump; + HYPRE_Int rep, reps; + + HYPRE_Int **iupper; + HYPRE_Int **ilower; + + HYPRE_Int istart[3]; + HYPRE_Int periodic[3]; + HYPRE_Int **offsets; + HYPRE_Int constant_coefficient = 0; + HYPRE_Int *stencil_entries; + HYPRE_Int stencil_size; + HYPRE_Int diag_rank; + hypre_Index diag_index; + + HYPRE_StructGrid grid; + HYPRE_StructGrid readgrid; + HYPRE_StructStencil stencil; + + HYPRE_Int i, s; + HYPRE_Int ix, iy, iz, ib; + + HYPRE_Int read_fromfile_param; + HYPRE_Int read_fromfile_index; + HYPRE_Int read_rhsfromfile_param; + HYPRE_Int read_rhsfromfile_index; + HYPRE_Int read_x0fromfile_param; + HYPRE_Int read_x0fromfile_index; + HYPRE_Int periodx0[3] = {0,0,0}; + HYPRE_Int *readperiodic; + HYPRE_Int sum; + HYPRE_Int inner; + + HYPRE_Int print_system = 0; + + /* begin lobpcg */ + + HYPRE_Int lobpcgFlag = 0; + HYPRE_Int lobpcgSeed = 0; + HYPRE_Int blockSize = 1; + HYPRE_Int verbosity = 1; + HYPRE_Int iterations; + HYPRE_Int maxIterations = 100; + HYPRE_Int checkOrtho = 0; + HYPRE_Int printLevel = 0; + HYPRE_Int pcgIterations = 0; + HYPRE_Int pcgMode = 0; + HYPRE_Real tol = 1e-6; + HYPRE_Real pcgTol = 1e-2; + HYPRE_Real nonOrthF; + + FILE* filePtr; + + mv_MultiVectorPtr eigenvectors = NULL; + mv_MultiVectorPtr constrains = NULL; + HYPRE_Real* eigenvalues = NULL; + + HYPRE_Real* residuals; + utilities_FortranMatrix* residualNorms; + utilities_FortranMatrix* residualNormsHistory; + utilities_FortranMatrix* eigenvaluesHistory; + utilities_FortranMatrix* printBuffer; + utilities_FortranMatrix* gramXX; + utilities_FortranMatrix* identity; + + HYPRE_StructSolver lobpcg_solver; + + mv_InterfaceInterpreter* interpreter; + HYPRE_MatvecFunctions matvec_fn; + /* end lobpcg */ + + /*----------------------------------------------------------- + * Initialize some stuff + *-----------------------------------------------------------*/ + + /* Initialize MPI */ + hypre_MPI_Init(&argc, &argv); + +#if defined(HYPRE_USE_KOKKOS) + Kokkos::InitArguments args; + args.num_threads = 12; + Kokkos::initialize (args); +#endif + + hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); + hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + + +#ifdef HYPRE_DEBUG + cegdb(&argc, &argv, myid); +#endif + + hypre_InitMemoryDebug(myid); + + /*----------------------------------------------------------- + * Set defaults + *-----------------------------------------------------------*/ + + dim = 3; + + skip = 0; + sym = 1; + rap = 0; + relax = 1; + usr_jacobi_weight= 0; + jump = 0; + reps = 1; + + nx = 10; + ny = 10; + nz = 10; + + P = num_procs; + Q = 1; + R = 1; + + bx = 1; + by = 1; + bz = 1; + + cx = 1.0; + cy = 1.0; + cz = 1.0; + conx = 0.0; + cony = 0.0; + conz = 0.0; + + n_pre = 1; + n_post = 1; + + solver_id = 0; + solver_type = 1; + + istart[0] = -3; + istart[1] = -3; + istart[2] = -3; + + px = 0; + py = 0; + pz = 0; + + cf_tol = 0.90; + + /* setting defaults for the reading parameters */ + read_fromfile_param = 0; + read_fromfile_index = argc; + read_rhsfromfile_param = 0; + read_rhsfromfile_index = argc; + read_x0fromfile_param = 0; + read_x0fromfile_index = argc; + sum = 0; + + /* ghosts for the building of matrix: default */ + for (i = 0; i < dim; i++) + { + A_num_ghost[2*i] = 1; + A_num_ghost[2*i + 1] = 1; + } + + /*----------------------------------------------------------- + * Parse command line + *-----------------------------------------------------------*/ + + print_usage = 0; + arg_index = 1; + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + nx = atoi(argv[arg_index++]); + ny = atoi(argv[arg_index++]); + nz = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-istart") == 0 ) + { + arg_index++; + istart[0] = atoi(argv[arg_index++]); + istart[1] = atoi(argv[arg_index++]); + istart[2] = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-P") == 0 ) + { + arg_index++; + P = atoi(argv[arg_index++]); + Q = atoi(argv[arg_index++]); + R = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-b") == 0 ) + { + arg_index++; + bx = atoi(argv[arg_index++]); + by = atoi(argv[arg_index++]); + bz = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-p") == 0 ) + { + arg_index++; + px = atoi(argv[arg_index++]); + py = atoi(argv[arg_index++]); + pz = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-c") == 0 ) + { + arg_index++; + cx = atof(argv[arg_index++]); + cy = atof(argv[arg_index++]); + cz = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-convect") == 0 ) + { + arg_index++; + conx = atof(argv[arg_index++]); + cony = atof(argv[arg_index++]); + conz = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-d") == 0 ) + { + arg_index++; + dim = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-fromfile") == 0 ) + { + arg_index++; + read_fromfile_param = 1; + read_fromfile_index = arg_index; + } + else if ( strcmp(argv[arg_index], "-rhsfromfile") == 0 ) + { + arg_index++; + read_rhsfromfile_param = 1; + read_rhsfromfile_index = arg_index; + } + else if ( strcmp(argv[arg_index], "-x0fromfile") == 0 ) + { + arg_index++; + read_x0fromfile_param = 1; + read_x0fromfile_index = arg_index; + } + else if (strcmp(argv[arg_index], "-repeats") == 0 ) + { + arg_index++; + reps = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + + /* begin lobpcg */ + if ( strcmp(argv[arg_index], "none") == 0 ) { + solver_id = NO_SOLVER; + arg_index++; + } + else /* end lobpcg */ + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-v") == 0 ) + { + arg_index++; + n_pre = atoi(argv[arg_index++]); + n_post = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rap") == 0 ) + { + arg_index++; + rap = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-relax") == 0 ) + { + arg_index++; + relax = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-w") == 0 ) + { + arg_index++; + jacobi_weight= atof(argv[arg_index++]); + usr_jacobi_weight= 1; /* flag user weight */ + } + else if ( strcmp(argv[arg_index], "-sym") == 0 ) + { + arg_index++; + sym = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-skip") == 0 ) + { + arg_index++; + skip = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-jump") == 0 ) + { + arg_index++; + jump = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver_type") == 0 ) + { + arg_index++; + solver_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-cf") == 0 ) + { + arg_index++; + cf_tol = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-print") == 0 ) + { + arg_index++; + print_system = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + /* begin lobpcg */ + else if ( strcmp(argv[arg_index], "-lobpcg") == 0 ) + { /* use lobpcg */ + arg_index++; + lobpcgFlag = 1; + } + else if ( strcmp(argv[arg_index], "-orthchk") == 0 ) + { /* lobpcg: check orthonormality */ + arg_index++; + checkOrtho = 1; + } + else if ( strcmp(argv[arg_index], "-verb") == 0 ) + { /* lobpcg: verbosity level */ + arg_index++; + verbosity = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vrand") == 0 ) + { /* lobpcg: block size */ + arg_index++; + blockSize = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-seed") == 0 ) + { /* lobpcg: seed for srand */ + arg_index++; + lobpcgSeed = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-itr") == 0 ) + { /* lobpcg: max # of iterations */ + arg_index++; + maxIterations = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-tol") == 0 ) + { /* lobpcg: tolerance */ + arg_index++; + tol = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-pcgitr") == 0 ) + { /* lobpcg: max inner pcg iterations */ + arg_index++; + pcgIterations = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-pcgtol") == 0 ) + { /* lobpcg: inner pcg iterations tolerance */ + arg_index++; + pcgTol = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-pcgmode") == 0 ) + { /* lobpcg: initial guess for inner pcg */ + arg_index++; /* 0: zero, otherwise rhs */ + pcgMode = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vout") == 0 ) + { /* lobpcg: print level */ + arg_index++; + printLevel = atoi(argv[arg_index++]); + } + /* end lobpcg */ + else + { + arg_index++; + } + } + + /* begin lobpcg */ + + if ( solver_id == 0 && lobpcgFlag ) + solver_id = 10; + + /*end lobpcg */ + + sum = read_x0fromfile_param + read_rhsfromfile_param + read_fromfile_param; + + /*----------------------------------------------------------- + * Print usage info + *-----------------------------------------------------------*/ + + if ( (print_usage) && (myid == 0) ) + { + hypre_printf("\n"); + hypre_printf("Usage: %s []\n", argv[0]); + hypre_printf("\n"); + hypre_printf(" -n : problem size per block\n"); + hypre_printf(" -istart : start of box\n"); + hypre_printf(" -P : processor topology\n"); + hypre_printf(" -b : blocking per processor\n"); + hypre_printf(" -p : periodicity in each dimension\n"); + hypre_printf(" -c : diffusion coefficients\n"); + hypre_printf(" -convect : convection coefficients\n"); + hypre_printf(" -d : problem dimension (2 or 3)\n"); + hypre_printf(" -fromfile : prefix name for matrixfiles\n"); + hypre_printf(" -rhsfromfile : prefix name for rhsfiles\n"); + hypre_printf(" -x0fromfile : prefix name for firstguessfiles\n"); + hypre_printf(" -repeats : number of times to repeat the run, default 1.\n"); + hypre_printf(" -solver : solver ID\n"); + hypre_printf(" 0 - axpy\n"); + hypre_printf(" 1 - spMV\n"); + hypre_printf(" 2 - inner product\n"); + hypre_printf(" 3 - spMV with constant coeffs\n"); + hypre_printf(" 4 - spMV with constant coeffs var diag\n"); + hypre_printf(" 8 - Jacobi\n"); + hypre_printf(" -sym : symmetric storage (1) or not (0)\n"); + hypre_printf(" -jump : num levels to jump in SparseMSG\n"); + hypre_printf("\n"); + } + + if ( print_usage ) + { + exit(1); + } + + /*----------------------------------------------------------- + * Check a few things + *-----------------------------------------------------------*/ + + if ((P*Q*R) > num_procs) + { + if (myid == 0) + { + hypre_printf("Error: PxQxR is more than the number of processors\n"); + } + exit(1); + } + else if ((P*Q*R) < num_procs) + { + if (myid == 0) + { + hypre_printf("Warning: PxQxR is less than the number of processors\n"); + } + } + + if ((conx != 0.0 || cony !=0 || conz != 0) && sym == 1 ) + { + if (myid == 0) + { + hypre_printf("Warning: Convection produces non-symmetric matrix\n"); + } + sym = 0; + } + + /*----------------------------------------------------------- + * Print driver parameters + *-----------------------------------------------------------*/ + + if (myid == 0 && sum == 0) + { +#ifdef HYPRE_USE_DEFAULT + hypre_printf("Running with openMP macro\n"); +#endif +#ifdef HYPRE_USE_KOKKOS + hypre_printf("Running with Kokkos macro\n"); +#endif +#ifdef HYPRE_USE_CUDA + hypre_printf("Running with CUDA macro\n"); +#endif +#ifdef HYPRE_USE_RAJA + hypre_printf("Running with CUDA macro\n"); +#endif +#ifdef HYPRE_USE_KOKKOS_CUDA + hypre_printf("Running kokkos with CUDA macro\n"); +#endif + + hypre_printf("Running with these driver parameters:\n"); + hypre_printf(" (nx, ny, nz) = (%d, %d, %d)\n", nx, ny, nz); + hypre_printf(" (istart[0],istart[1],istart[2]) = (%d, %d, %d)\n", \ + istart[0],istart[1],istart[2]); + hypre_printf(" (Px, Py, Pz) = (%d, %d, %d)\n", P, Q, R); + hypre_printf(" (bx, by, bz) = (%d, %d, %d)\n", bx, by, bz); + hypre_printf(" (px, py, pz) = (%d, %d, %d)\n", px, py, pz); + hypre_printf(" (cx, cy, cz) = (%f, %f, %f)\n", cx, cy, cz); + hypre_printf(" (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz); + hypre_printf(" (n_pre, n_post) = (%d, %d)\n", n_pre, n_post); + hypre_printf(" dim = %d\n", dim); + hypre_printf(" skip = %d\n", skip); + hypre_printf(" sym = %d\n", sym); + hypre_printf(" rap = %d\n", rap); + hypre_printf(" relax = %d\n", relax); + hypre_printf(" jump = %d\n", jump); + hypre_printf(" solver ID = %d\n", solver_id); + } + + if (myid == 0 && sum > 0) + { + hypre_printf("Running with these driver parameters:\n"); + hypre_printf(" (cx, cy, cz) = (%f, %f, %f)\n", cx, cy, cz); + hypre_printf(" (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz); + hypre_printf(" (n_pre, n_post) = (%d, %d)\n", n_pre, n_post); + hypre_printf(" dim = %d\n", dim); + hypre_printf(" skip = %d\n", skip); + hypre_printf(" sym = %d\n", sym); + hypre_printf(" rap = %d\n", rap); + hypre_printf(" relax = %d\n", relax); + hypre_printf(" jump = %d\n", jump); + hypre_printf(" solver ID = %d\n", solver_id); + hypre_printf(" the grid is read from file \n"); + + } + + /*----------------------------------------------------------- + * Set up the stencil structure (7 points) when matrix is NOT read from file + * Set up the grid structure used when NO files are read + *-----------------------------------------------------------*/ + + switch (dim) + { + case 1: + nblocks = bx; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 2); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1); + offsets[1][0] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1); + offsets[1][0] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 1); + offsets[2][0] = 1; + } + /* compute p from P and myid */ + p = myid % P; + break; + + case 2: + nblocks = bx*by; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[2][0] = 0; + offsets[2][1] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 5); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[3][0] = 1; + offsets[3][1] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 2); + offsets[4][0] = 0; + offsets[4][1] = 1; + } + /* compute p,q from P,Q and myid */ + p = myid % P; + q = (( myid - p)/P) % Q; + break; + + case 3: + nblocks = bx*by*bz; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 4); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 7); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[4][0] = 1; + offsets[4][1] = 0; + offsets[4][2] = 0; + offsets[5] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[5][0] = 0; + offsets[5][1] = 1; + offsets[5][2] = 0; + offsets[6] = hypre_CTAlloc(HYPRE_Int, 3); + offsets[6][0] = 0; + offsets[6][1] = 0; + offsets[6][2] = 1; + } + /* compute p,q,r from P,Q,R and myid */ + p = myid % P; + q = (( myid - p)/P) % Q; + r = ( myid - p - P*q)/( P*Q ); + break; + } + + if (myid >= (P*Q*R)) + { + /* My processor has no data on it */ + nblocks = bx = by = bz = 0; + } + + /*----------------------------------------------------------- + * Set up the stencil structure needed for matrix creation + * which is always the case for read_fromfile_param == 0 + *-----------------------------------------------------------*/ + + HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil); + for (s = 0; s < (2-sym)*dim + 1; s++) + { + HYPRE_StructStencilSetElement(stencil, s, offsets[s]); + } + + /*----------------------------------------------------------- + * Set up periodic + *-----------------------------------------------------------*/ + + periodic[0] = px; + periodic[1] = py; + periodic[2] = pz; + + /*----------------------------------------------------------- + * Set up dxyz for PFMG solver + *-----------------------------------------------------------*/ + + /* We do the extreme cases first reading everything from files => sum = 3 + * building things from scratch (grid,stencils,extents) sum = 0 */ + + if ( (read_fromfile_param ==1) && + (read_x0fromfile_param ==1) && + (read_rhsfromfile_param ==1) + ) + { + hypre_printf("\nreading linear system from files: matrix, rhs and x0\n"); + /* ghost selection for reading the matrix and vectors */ + for (i = 0; i < dim; i++) + { + A_num_ghost[2*i] = 1; + A_num_ghost[2*i + 1] = 1; + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + A = (HYPRE_StructMatrix) + hypre_StructMatrixRead(hypre_MPI_COMM_WORLD, + argv[read_fromfile_index],A_num_ghost); + + b = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_rhsfromfile_index],v_num_ghost); + + x = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_x0fromfile_index],v_num_ghost); + } + + /* beginning of sum == 0 */ + if (sum == 0) /* no read from any file */ + { + /*----------------------------------------------------------- + * prepare space for the extents + *-----------------------------------------------------------*/ + + ilower = hypre_CTAlloc(HYPRE_Int*, nblocks); + iupper = hypre_CTAlloc(HYPRE_Int*, nblocks); + for (i = 0; i < nblocks; i++) + { + ilower[i] = hypre_CTAlloc(HYPRE_Int, dim); + iupper[i] = hypre_CTAlloc(HYPRE_Int, dim); + } + + /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */ + ib = 0; + switch (dim) + { + case 1: + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ib++; + } + break; + case 2: + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ib++; + } + break; + case 3: + for (iz = 0; iz < bz; iz++) + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ilower[ib][2] = istart[2]+ nz*(bz*r+iz); + iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1; + ib++; + } + break; + } + + HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid); + for (ib = 0; ib < nblocks; ib++) + { + /* Add to the grid a new box defined by ilower[ib], iupper[ib]...*/ + HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]); + } + HYPRE_StructGridSetPeriodic(grid, periodic); + HYPRE_StructGridAssemble(grid); + + /*----------------------------------------------------------- + * Set up the matrix structure + *-----------------------------------------------------------*/ + + for (i = 0; i < dim; i++) + { + A_num_ghost[2*i] = 1; + A_num_ghost[2*i + 1] = 1; + } + + HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A); + if ( solver_id == 3 || solver_id == 4 || + solver_id == 13 || solver_id == 14 ) + { + stencil_size = hypre_StructStencilSize(stencil); + stencil_entries = hypre_CTAlloc(HYPRE_Int, stencil_size); + if ( solver_id == 3 || solver_id == 13) + { + for ( i=0; i=1 ); + if ( diag_rank==0 ) stencil_entries[diag_rank]=1; + else stencil_entries[diag_rank]=0; + for ( i=0; i 0 ) && (sum < 3)) + { + /* the grid will come from rhs or from x0 */ + if (read_fromfile_param == 0) + { + + if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0)) + { + /* read right hand side, extract grid, construct matrix, + construct x0 */ + + hypre_printf("\ninitial rhs from file prefix :%s\n", + argv[read_rhsfromfile_index]); + + /* ghost selection for vector */ + for (i = 0; i < dim; i++) + { + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + b = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_rhsfromfile_index], + v_num_ghost); + + readgrid = hypre_StructVectorGrid(b) ; + readperiodic = hypre_StructGridPeriodic(readgrid); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x); + HYPRE_StructVectorInitialize(x); + + AddValuesVector(readgrid,x,periodx0,0.0); + HYPRE_StructVectorAssemble(x); + + HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, + readgrid, stencil, &A); + HYPRE_StructMatrixSetSymmetric(A, 1); + HYPRE_StructMatrixSetNumGhost(A, A_num_ghost); + HYPRE_StructMatrixInitialize(A); + + /*----------------------------------------------------------- + * Fill in the matrix elements + *-----------------------------------------------------------*/ + + AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz); + + /* Zero out stencils reaching to real boundary */ + + if ( constant_coefficient==0 ) + SetStencilBndry(A,readgrid,readperiodic); + HYPRE_StructMatrixAssemble(A); + } + /* done with one case rhs=1 x0 = 0 */ + + /* case when rhs=0 and read x0=1 */ + if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0)) + { + /* read right hand side, extract grid, construct matrix, + construct x0 */ + + hypre_printf("\ninitial x0 from file prefix :%s\n", + argv[read_x0fromfile_index]); + + /* ghost selection for vector */ + for (i = 0; i < dim; i++) + { + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + x = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_x0fromfile_index],v_num_ghost); + + readgrid = hypre_StructVectorGrid(x) ; + readperiodic = hypre_StructGridPeriodic(readgrid); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b); + HYPRE_StructVectorInitialize(b); + AddValuesVector(readgrid,b,readperiodic,1.0); + + HYPRE_StructVectorAssemble(b); + + HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, + readgrid, stencil, &A); + HYPRE_StructMatrixSetSymmetric(A, 1); + HYPRE_StructMatrixSetNumGhost(A, A_num_ghost); + HYPRE_StructMatrixInitialize(A); + + /*----------------------------------------------------------- + * Fill in the matrix elements + *-----------------------------------------------------------*/ + + AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz); + + /* Zero out stencils reaching to real boundary */ + + if ( constant_coefficient == 0 ) + SetStencilBndry(A,readgrid,readperiodic); + HYPRE_StructMatrixAssemble(A); + } + /* done with one case rhs=0 x0 = 1 */ + + /* the other case when read rhs > 0 and read x0 > 0 */ + if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param > 0)) + { + /* read right hand side, extract grid, construct matrix, + construct x0 */ + + hypre_printf("\ninitial rhs from file prefix :%s\n", + argv[read_rhsfromfile_index]); + hypre_printf("\ninitial x0 from file prefix :%s\n", + argv[read_x0fromfile_index]); + + /* ghost selection for vector */ + for (i = 0; i < dim; i++) + { + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + b = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_rhsfromfile_index], + v_num_ghost); + + x = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_x0fromfile_index], + v_num_ghost); + + readgrid= hypre_StructVectorGrid(b) ; + readperiodic = hypre_StructGridPeriodic(readgrid); + + HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, + readgrid, stencil, &A); + HYPRE_StructMatrixSetSymmetric(A, 1); + HYPRE_StructMatrixSetNumGhost(A, A_num_ghost); + HYPRE_StructMatrixInitialize(A); + + /*----------------------------------------------------------- + * Fill in the matrix elements + *-----------------------------------------------------------*/ + + AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz); + + /* Zero out stencils reaching to real boundary */ + + if ( constant_coefficient == 0 ) + SetStencilBndry(A,readgrid,readperiodic); + HYPRE_StructMatrixAssemble(A); + } + /* done with one case rhs=1 x0 = 1 */ + } + /* done with the case where you no read matrix */ + + if (read_fromfile_param == 1) /* still sum > 0 */ + { + hypre_printf("\nreading matrix from file:%s\n", + argv[read_fromfile_index]); + /* ghost selection for reading the matrix */ + for (i = 0; i < dim; i++) + { + A_num_ghost[2*i] = 1; + A_num_ghost[2*i + 1] = 1; + } + + A = (HYPRE_StructMatrix) + hypre_StructMatrixRead(hypre_MPI_COMM_WORLD, + argv[read_fromfile_index], A_num_ghost); + + readgrid = hypre_StructMatrixGrid(A); + readperiodic = hypre_StructGridPeriodic(readgrid); + + if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0)) + { + /* read right hand side ,construct x0 */ + hypre_printf("\ninitial rhs from file prefix :%s\n", + argv[read_rhsfromfile_index]); + + /* ghost selection for vector */ + for (i = 0; i < dim; i++) + { + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + b = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_rhsfromfile_index], + v_num_ghost); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid,&x); + HYPRE_StructVectorInitialize(x); + AddValuesVector(readgrid,x,periodx0,0.0); + HYPRE_StructVectorAssemble(x); + } + + if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0)) + { + /* read x0, construct rhs*/ + hypre_printf("\ninitial x0 from file prefix :%s\n", + argv[read_x0fromfile_index]); + + /* ghost selection for vector */ + for (i = 0; i < dim; i++) + { + v_num_ghost[2*i] = 1; + v_num_ghost[2*i + 1] = 1; + } + + x = (HYPRE_StructVector) + hypre_StructVectorRead(hypre_MPI_COMM_WORLD, + argv[read_x0fromfile_index], + v_num_ghost); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b); + HYPRE_StructVectorInitialize(b); + AddValuesVector(readgrid,b,readperiodic,1.0); + HYPRE_StructVectorAssemble(b); + } + + if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param == 0)) + { + /* construct x0 , construct b*/ + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b); + HYPRE_StructVectorInitialize(b); + AddValuesVector(readgrid,b,readperiodic,1.0); + HYPRE_StructVectorAssemble(b); + + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x); + HYPRE_StructVectorInitialize(x); + AddValuesVector(readgrid,x,periodx0,0.0); + HYPRE_StructVectorAssemble(x); + } + } + /* finish the read of matrix */ + } + /* finish the sum > 0 case */ + + /*----------------------------------------------------------- + * Print out the system and initial guess + *-----------------------------------------------------------*/ + + if (print_system) + { + HYPRE_StructMatrixPrint("struct.out.A", A, 0); + HYPRE_StructVectorPrint("struct.out.b", b, 0); + HYPRE_StructVectorPrint("struct.out.x0", x, 0); + } + + /*----------------------------------------------------------- + * axpy + *-----------------------------------------------------------*/ + +#if !HYPRE_MFLOPS + + hypre_MPI_Barrier(hypre_MPI_COMM_WORLD); + + if (solver_id == 0) + { + //timeval tstart,tstop; + time_index = hypre_InitializeTiming("axpy"); + hypre_BeginTiming(time_index); + //gettimeofday(&tstart,NULL); + + for ( rep=0; rep 9) && (solver_id < 20)) + { + } + + /*----------------------------------------------------------- + * Print the solution and other info + *-----------------------------------------------------------*/ + + if (print_system) + { + HYPRE_StructVectorPrint("struct.out.x", x, 0); + } +#endif + /*----------------------------------------------------------- + * Compute MFLOPs for Matvec + *-----------------------------------------------------------*/ + +#if HYPRE_MFLOPS + { + void *matvec_data; + HYPRE_Int i, imax, N; + + /* compute imax */ + N = (P*nx)*(Q*ny)*(R*nz); + imax = (5*1000000) / N; + + matvec_data = hypre_StructMatvecCreate(); + hypre_StructMatvecSetup(matvec_data, A, x); + + time_index = hypre_InitializeTiming("Matvec"); + hypre_BeginTiming(time_index); + + for (i = 0; i < imax; i++) + { + hypre_StructMatvecCompute(matvec_data, 1.0, A, x, 1.0, b); + } + /* this counts mult-adds */ + hypre_IncFLOPCount(7*N*imax); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Matvec time", hypre_MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + hypre_StructMatvecDestroy(matvec_data); + } +#endif + + /*----------------------------------------------------------- + * Finalize things + *-----------------------------------------------------------*/ + + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + + for ( i = 0; i < (dim + 1); i++) + hypre_TFree(offsets[i]); + hypre_TFree(offsets); + + hypre_FinalizeMemoryDebug(); + + /* Finalize MPI */ + hypre_MPI_Finalize(); +#if defined(HYPRE_USE_KOKKOS) + Kokkos::finalize (); +#endif + return (0); +} + +/*------------------------------------------------------------------------- + * add constant values to a vector. Need to pass the initialized vector, grid, + * period of grid and the constant value. + *-------------------------------------------------------------------------*/ + +HYPRE_Int +AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) +{ +/* #include "_hypre_struct_mv.h" */ + HYPRE_Int ierr = 0; + hypre_BoxArray *gridboxes; + HYPRE_Int i,ib; + hypre_IndexRef ilower; + hypre_IndexRef iupper; + hypre_Box *box; + HYPRE_Real *values; + HYPRE_Int volume,dim; + + gridboxes = hypre_StructGridBoxes(gridvector); + dim = hypre_StructGridNDim(gridvector); + + ib=0; + hypre_ForBoxI(ib, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, ib); + volume = hypre_BoxVolume(box); + values = hypre_CTAlloc(HYPRE_Real, volume); + + /*----------------------------------------------------------- + * For periodic b.c. in all directions, need rhs to satisfy + * compatibility condition. Achieved by setting a source and + * sink of equal strength. All other problems have rhs = 1. + *-----------------------------------------------------------*/ + + if ((dim == 2 && period[0] != 0 && period[1] != 0) || + (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0)) + { + for (i = 0; i < volume; i++) + { + values[i] = 0.0; + } + values[0] = value; + values[volume - 1] = -value; + } + else + { + for (i = 0; i < volume; i++) + { + values[i] = value; + } + } + + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); + hypre_TFree(values); + + } + + return ierr; +} + +/****************************************************************************** + * Adds values to matrix based on a 7 point (3d) + * symmetric stencil for a convection-diffusion problem. + * It need an initialized matrix, an assembled grid, and the constants + * that determine the 7 point (3d) convection-diffusion. + ******************************************************************************/ + +HYPRE_Int +AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix, + HYPRE_Real cx, + HYPRE_Real cy, + HYPRE_Real cz, + HYPRE_Real conx, + HYPRE_Real cony, + HYPRE_Real conz) +{ + + HYPRE_Int ierr=0; + hypre_BoxArray *gridboxes; + HYPRE_Int i,s,bi; + hypre_IndexRef ilower; + hypre_IndexRef iupper; + hypre_Box *box; + HYPRE_Real *values; + HYPRE_Real east,west; + HYPRE_Real north,south; + HYPRE_Real top,bottom; + HYPRE_Real center; + HYPRE_Int volume,dim,sym; + HYPRE_Int *stencil_indices; + HYPRE_Int stencil_size; + HYPRE_Int constant_coefficient; + + gridboxes = hypre_StructGridBoxes(gridmatrix); + dim = hypre_StructGridNDim(gridmatrix); + sym = hypre_StructMatrixSymmetric(A); + constant_coefficient = hypre_StructMatrixConstantCoefficient(A); + + bi=0; + + east = -cx; + west = -cx; + north = -cy; + south = -cy; + top = -cz; + bottom = -cz; + center = 2.0*cx; + if (dim > 1) center += 2.0*cy; + if (dim > 2) center += 2.0*cz; + + stencil_size = 1 + (2 - sym) * dim; + stencil_indices = hypre_CTAlloc(HYPRE_Int, stencil_size); + for (s = 0; s < stencil_size; s++) + { + stencil_indices[s] = s; + } + + if(sym) + { + if ( constant_coefficient==0 ) + { + hypre_ForBoxI(bi, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, bi); + volume = hypre_BoxVolume(box); + values = hypre_CTAlloc(HYPRE_Real, stencil_size*volume); + + for (i = 0; i < stencil_size*volume; i += stencil_size) + { + switch (dim) + { + case 1: + values[i ] = west; + values[i+1] = center; + break; + case 2: + values[i ] = west; + values[i+1] = south; + values[i+2] = center; + break; + case 3: + values[i ] = west; + values[i+1] = south; + values[i+2] = bottom; + values[i+3] = center; + break; + } + } + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size, + stencil_indices, values); + hypre_TFree(values); + } + } + else if ( constant_coefficient==1 ) + { + values = hypre_CTAlloc(HYPRE_Real, stencil_size); + switch (dim) + { + case 1: + values[0] = west; + values[1] = center; + break; + case 2: + values[0] = west; + values[1] = south; + values[2] = center; + break; + case 3: + values[0] = west; + values[1] = south; + values[2] = bottom; + values[3] = center; + break; + } + if (hypre_BoxArraySize(gridboxes) > 0) + { + HYPRE_StructMatrixSetConstantValues(A, stencil_size, + stencil_indices, values); + } + hypre_TFree(values); + } + else + { + hypre_assert( constant_coefficient==2 ); + + /* stencil index for the center equals dim, so it's easy to leave out */ + values = hypre_CTAlloc(HYPRE_Real, stencil_size-1); + switch (dim) + { + case 1: + values[0] = west; + break; + case 2: + values[0] = west; + values[1] = south; + break; + case 3: + values[0] = west; + values[1] = south; + values[2] = bottom; + break; + } + if (hypre_BoxArraySize(gridboxes) > 0) + { + HYPRE_StructMatrixSetConstantValues(A, stencil_size-1, + stencil_indices, values); + } + hypre_TFree(values); + + hypre_ForBoxI(bi, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, bi); + volume = hypre_BoxVolume(box); + values = hypre_CTAlloc(HYPRE_Real, volume); + + for ( i=0; i < volume; ++i ) + { + values[i] = center; + } + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices+dim, values); + hypre_TFree(values); + } + } + } + else + { + if (conx > 0.0) + { + west -= conx; + center += conx; + } + else if (conx < 0.0) + { + east += conx; + center -= conx; + } + if (cony > 0.0) + { + south -= cony; + center += cony; + } + else if (cony < 0.0) + { + north += cony; + center -= cony; + } + if (conz > 0.0) + { + bottom -= conz; + center += conz; + } + else if (cony < 0.0) + { + top += conz; + center -= conz; + } + + if ( constant_coefficient==0 ) + { + hypre_ForBoxI(bi, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, bi); + volume = hypre_BoxVolume(box); + values = hypre_CTAlloc(HYPRE_Real, stencil_size*volume); + + for (i = 0; i < stencil_size*volume; i += stencil_size) + { + switch (dim) + { + case 1: + values[i ] = west; + values[i+1] = center; + values[i+2] = east; + break; + case 2: + values[i ] = west; + values[i+1] = south; + values[i+2] = center; + values[i+3] = east; + values[i+4] = north; + break; + case 3: + values[i ] = west; + values[i+1] = south; + values[i+2] = bottom; + values[i+3] = center; + values[i+4] = east; + values[i+5] = north; + values[i+6] = top; + break; + } + } + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size, + stencil_indices, values); + + hypre_TFree(values); + } + } + else if ( constant_coefficient==1 ) + { + values = hypre_CTAlloc( HYPRE_Real, stencil_size ); + + switch (dim) + { + case 1: + values[0] = west; + values[1] = center; + values[2] = east; + break; + case 2: + values[0] = west; + values[1] = south; + values[2] = center; + values[3] = east; + values[4] = north; + break; + case 3: + values[0] = west; + values[1] = south; + values[2] = bottom; + values[3] = center; + values[4] = east; + values[5] = north; + values[6] = top; + break; + } + + if (hypre_BoxArraySize(gridboxes) > 0) + { + HYPRE_StructMatrixSetConstantValues(A, stencil_size, + stencil_indices, values); + } + + hypre_TFree(values); + } + else + { + hypre_assert( constant_coefficient==2 ); + values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 ); + switch (dim) + { /* no center in stencil_indices and values */ + case 1: + stencil_indices[0] = 0; + stencil_indices[1] = 2; + values[0] = west; + values[1] = east; + break; + case 2: + stencil_indices[0] = 0; + stencil_indices[1] = 1; + stencil_indices[2] = 3; + stencil_indices[3] = 4; + values[0] = west; + values[1] = south; + values[2] = east; + values[3] = north; + break; + case 3: + stencil_indices[0] = 0; + stencil_indices[1] = 1; + stencil_indices[2] = 2; + stencil_indices[3] = 4; + stencil_indices[4] = 5; + stencil_indices[5] = 6; + values[0] = west; + values[1] = south; + values[2] = bottom; + values[3] = east; + values[4] = north; + values[5] = top; + break; + } + + if (hypre_BoxArraySize(gridboxes) > 0) + { + HYPRE_StructMatrixSetConstantValues(A, stencil_size, + stencil_indices, values); + } + hypre_TFree(values); + + + /* center is variable */ + stencil_indices[0] = dim; /* refers to center */ + hypre_ForBoxI(bi, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, bi); + volume = hypre_BoxVolume(box); + values = hypre_CTAlloc(HYPRE_Real, volume); + + for ( i=0; i < volume; ++i ) + { + values[i] = center; + } + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + hypre_TFree(values); + } + } + } + + hypre_TFree(stencil_indices); + + return ierr; +} + +/********************************************************************************* + * this function sets to zero the stencil entries that are on the boundary + * Grid, matrix and the period are needed. + *********************************************************************************/ + +HYPRE_Int +SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period) +{ + + HYPRE_Int ierr=0; + hypre_BoxArray *gridboxes; + HYPRE_Int size,i,j,d,ib; + HYPRE_Int **ilower; + HYPRE_Int **iupper; + HYPRE_Int *vol; + HYPRE_Int *istart, *iend; + hypre_Box *box; + hypre_Box *dummybox; + hypre_Box *boundingbox; + HYPRE_Real *values; + HYPRE_Int volume, dim; + HYPRE_Int *stencil_indices; + HYPRE_Int constant_coefficient; + + gridboxes = hypre_StructGridBoxes(gridmatrix); + boundingbox = hypre_StructGridBoundingBox(gridmatrix); + istart = hypre_BoxIMin(boundingbox); + iend = hypre_BoxIMax(boundingbox); + size = hypre_StructGridNumBoxes(gridmatrix); + dim = hypre_StructGridNDim(gridmatrix); + stencil_indices = hypre_CTAlloc(HYPRE_Int, 1); + + constant_coefficient = hypre_StructMatrixConstantCoefficient(A); + if ( constant_coefficient>0 ) return 1; + /*...no space dependence if constant_coefficient==1, + and space dependence only for diagonal if constant_coefficient==2 -- + and this function only touches off-diagonal entries */ + + vol = hypre_CTAlloc(HYPRE_Int, size); + ilower = hypre_CTAlloc(HYPRE_Int*, size); + iupper = hypre_CTAlloc(HYPRE_Int*, size); + for (i = 0; i < size; i++) + { + ilower[i] = hypre_CTAlloc(HYPRE_Int, dim); + iupper[i] = hypre_CTAlloc(HYPRE_Int, dim); + } + + i = 0; + ib = 0; + hypre_ForBoxI(i, gridboxes) + { + dummybox = hypre_BoxCreate(dim); + box = hypre_BoxArrayBox(gridboxes, i); + volume = hypre_BoxVolume(box); + vol[i] = volume; + hypre_CopyBox(box,dummybox); + for (d = 0; d < dim; d++) + { + ilower[ib][d] = hypre_BoxIMinD(dummybox,d); + iupper[ib][d] = hypre_BoxIMaxD(dummybox,d); + } + ib++ ; + hypre_BoxDestroy(dummybox); + } + + if ( constant_coefficient==0 ) + { + for (d = 0; d < dim; d++) + { + for (ib = 0; ib < size; ib++) + { + values = hypre_CTAlloc(HYPRE_Real, vol[ib]); + + for (i = 0; i < vol[ib]; i++) + { + values[i] = 0.0; + } + + if( ilower[ib][d] == istart[d] && period[d] == 0 ) + { + j = iupper[ib][d]; + iupper[ib][d] = istart[d]; + stencil_indices[0] = d; + HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib], + 1, stencil_indices, values); + iupper[ib][d] = j; + } + + if( iupper[ib][d] == iend[d] && period[d] == 0 ) + { + j = ilower[ib][d]; + ilower[ib][d] = iend[d]; + stencil_indices[0] = dim + 1 + d; + HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib], + 1, stencil_indices, values); + ilower[ib][d] = j; + } + hypre_TFree(values); + } + } + } + + hypre_TFree(vol); + hypre_TFree(stencil_indices); + for (ib =0 ; ib < size ; ib++) + { + hypre_TFree(ilower[ib]); + hypre_TFree(iupper[ib]); + } + hypre_TFree(ilower); + hypre_TFree(iupper); + + return ierr; +} diff -Nru hypre-2.11.2/src/test/TEST_examples/complex.jobs hypre-2.13.0/src/test/TEST_examples/complex.jobs --- hypre-2.11.2/src/test/TEST_examples/complex.jobs 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_examples/complex.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -15,5 +15,5 @@ # Run complex examples #============================================================================= -mpirun -np 16 ex18comp -n 4 > complex.out.1 +mpirun -np 16 ./ex18comp -n 4 > complex.out.1 diff -Nru hypre-2.11.2/src/test/TEST_examples/maxdim.jobs hypre-2.13.0/src/test/TEST_examples/maxdim.jobs --- hypre-2.11.2/src/test/TEST_examples/maxdim.jobs 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_examples/maxdim.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -15,6 +15,6 @@ # Run maxdim examples #============================================================================= -mpirun -np 16 ex17 -n 10 > maxdim.out.1 +mpirun -np 16 ./ex17 -n 10 > maxdim.out.1 -mpirun -np 16 ex18 -n 4 > maxdim.out.2 +mpirun -np 16 ./ex18 -n 4 > maxdim.out.2 diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.jobs hypre-2.13.0/src/test/TEST_ij/smoother.jobs --- hypre-2.11.2/src/test/TEST_ij/smoother.jobs 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/smoother.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -36,6 +36,7 @@ # 14: Polynomial (Chebyshev 3rd order) on 4 procs # 15: FCF Jacobi on 4 procs # 16: CG smoother on 4 procs +# 17-20: Polynomial (Chebyshev 2nd order) with various options #============================================================================= mpirun -np 3 ./ij -rhsrand -n 15 30 10 -w 1.1 -owl 1.0 0 \ @@ -87,3 +88,21 @@ mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 15 -n 20 20 10 -P 2 2 1 \ > smoother.out.15 +mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_scale 0 -n 20 20 20 \ +-P 2 2 1 -27pt > smoother.out.16 + +mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_variant 1 -n 20 20 20 \ +-P 2 2 1 > smoother.out.17 + +mpirun -np 4 ./ij -solver 3 -rlx 16 -cheby_eig_est 0 -n 40 40 20 \ +-P 2 2 1 -difconv -a 10 10 10 > smoother.out.18 + +mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -rotate -alpha 60 -eps 0.1 -cheby_fraction 0.2 -n 200 200 \ +-P 2 2 > smoother.out.19 + +mpirun -np 4 ./ij -solver 1 -rlx 16 -cheby_eig_est 5 -n 40 40 20 \ +-P 2 2 1 -vardifconv -eps 0.1 > smoother.out.20 + + + + diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.saved hypre-2.13.0/src/test/TEST_ij/smoother.saved --- hypre-2.11.2/src/test/TEST_ij/smoother.saved 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/smoother.saved 2017-10-20 17:42:22.000000000 +0000 @@ -74,3 +74,23 @@ Iterations = 15 Final Relative Residual Norm = 5.807749e-09 +# Output file: smoother.out.16 +Iterations = 6 +Final Relative Residual Norm = 1.555966e-09 + +# Output file: smoother.out.17 +Iterations = 7 +Final Relative Residual Norm = 2.088732e-09 + +# Output file: smoother.out.18 +GMRES Iterations = 11 +Final GMRES Relative Residual Norm = 8.192864e-09 + +# Output file: smoother.out.19 +Iterations = 6 +Final Relative Residual Norm = 8.887087e-10 + +# Output file: smoother.out.20 +Iterations = 11 +Final Relative Residual Norm = 3.089502e-09 + diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.sh hypre-2.13.0/src/test/TEST_ij/smoother.sh --- hypre-2.11.2/src/test/TEST_ij/smoother.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/smoother.sh 2017-10-20 17:42:22.000000000 +0000 @@ -43,6 +43,11 @@ ${TNAME}.out.13\ ${TNAME}.out.14\ ${TNAME}.out.15\ + ${TNAME}.out.16\ + ${TNAME}.out.17\ + ${TNAME}.out.18\ + ${TNAME}.out.19\ + ${TNAME}.out.20\ " for i in $FILES diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.jobs hypre-2.13.0/src/test/TEST_ij/solvers.jobs --- hypre-2.11.2/src/test/TEST_ij/solvers.jobs 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/solvers.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -82,6 +82,9 @@ mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 #nonGalerkin version mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 @@ -90,3 +93,30 @@ #RAP options mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 + +# +# MGR and MGR-PCG +# +# coarse grid solver checks (1-level MGR == AMG (or coarse grid solver)) +# Also checks for keeping coarse nodes to coarsest level +# coarse grid size in output should be ~ mgr_num_reserved_nodes +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 +# multi level MGR tests with different coarse grid type strategies +# Fix non C points to F points with different F-relaxation methods (single/multileve F-relaxation) +# with/ without reserved coarse nodes +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 +# Not fixed non C points to F points with different F-relaxation methods (single/multileve F-relaxation) +# with/ without reserved coarse nodes +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 +# MGR-PCG tests +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.saved hypre-2.13.0/src/test/TEST_ij/solvers.saved --- hypre-2.11.2/src/test/TEST_ij/solvers.saved 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/solvers.saved 2017-10-20 17:42:22.000000000 +0000 @@ -37,22 +37,22 @@ Final Relative Residual Norm = 6.698760e-09 # Output file: solvers.out.9 -Iterations = 12 -PCG_Iterations = 8 +Iterations = 11 +PCG_Iterations = 7 DSCG_Iterations = 4 -Final Relative Residual Norm = 2.111630e-09 +Final Relative Residual Norm = 6.805037e-10 # Output file: solvers.out.10 -Iterations = 10 -PCG_Iterations = 8 +Iterations = 9 +PCG_Iterations = 7 DSCG_Iterations = 2 -Final Relative Residual Norm = 6.719344e-09 +Final Relative Residual Norm = 4.842561e-09 # Output file: solvers.out.11 Iterations = 7 PCG_Iterations = 4 DSCG_Iterations = 3 -Final Relative Residual Norm = 6.859548e-09 +Final Relative Residual Norm = 1.289994e-10 # Output file: solvers.out.sysh Average Convergence Factor = 0.122031 @@ -143,3 +143,71 @@ GMRES Iterations = 10 Final GMRES Relative Residual Norm = 1.006494e-09 +# Output file: solvers.out.118 +GMRES Iterations = 25 +Final GMRES Relative Residual Norm = 9.464475e-09 + +# Output file: solvers.out.119 +GMRES Iterations = 23 +Final GMRES Relative Residual Norm = 9.269998e-09 + +# Output file: solvers.out.120 +GMRES Iterations = 17 +Final GMRES Relative Residual Norm = 3.995718e-09 + +# Output file: solvers.out.200 +MGR Iterations = 6 +Final Relative Residual Norm = 6.980278e-10 + +# Output file: solvers.out.201 +MGR Iterations = 6 +Final Relative Residual Norm = 5.136059e-09 + +# Output file: solvers.out.202 +MGR Iterations = 6 +Final Relative Residual Norm = 6.980278e-10 + +# Output file: solvers.out.203 +MGR Iterations = 6 +Final Relative Residual Norm = 5.136059e-09 + +# Output file: solvers.out.204 +MGR Iterations = 74 +Final Relative Residual Norm = 8.553484e-09 + +# Output file: solvers.out.205 +MGR Iterations = 70 +Final Relative Residual Norm = 9.518056e-09 + +# Output file: solvers.out.206 +MGR Iterations = 9 +Final Relative Residual Norm = 3.330705e-09 + +# Output file: solvers.out.207 +MGR Iterations = 9 +Final Relative Residual Norm = 6.199197e-09 + +# Output file: solvers.out.208 +MGR Iterations = 57 +Final Relative Residual Norm = 7.786878e-09 + +# Output file: solvers.out.209 +MGR Iterations = 54 +Final Relative Residual Norm = 7.641495e-09 + +# Output file: solvers.out.210 +MGR Iterations = 9 +Final Relative Residual Norm = 1.103348e-09 + +# Output file: solvers.out.211 +MGR Iterations = 9 +Final Relative Residual Norm = 2.211486e-09 + +# Output file: solvers.out.212 +Iterations = 5 +Final Relative Residual Norm = 1.371340e-09 + +# Output file: solvers.out.213 +Iterations = 29 +Final Relative Residual Norm = 5.204677e-09 + diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.sh hypre-2.13.0/src/test/TEST_ij/solvers.sh --- hypre-2.11.2/src/test/TEST_ij/solvers.sh 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_ij/solvers.sh 2017-10-20 17:42:22.000000000 +0000 @@ -33,6 +33,18 @@ diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2 #============================================================================= +# IJ: MGR case nlevels < 1 and bsize < 2 should be the same +# compare results +#============================================================================= + +tail -17 ${TNAME}.out.200 | head -6 > ${TNAME}.mgr_testdata + +#============================================================================= + +tail -17 ${TNAME}.out.202 | head -6 > ${TNAME}.mgr_testdata.temp +diff ${TNAME}.mgr_testdata ${TNAME}.mgr_testdata.temp >&2 + +#============================================================================= # compare with baseline case #============================================================================= @@ -96,6 +108,32 @@ ${TNAME}.out.115\ ${TNAME}.out.116\ ${TNAME}.out.117\ + ${TNAME}.out.118\ + ${TNAME}.out.119\ + ${TNAME}.out.120\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done >> ${TNAME}.out + +FILES="\ + ${TNAME}.out.200\ + ${TNAME}.out.201\ + ${TNAME}.out.202\ + ${TNAME}.out.203\ + ${TNAME}.out.204\ + ${TNAME}.out.205\ + ${TNAME}.out.206\ + ${TNAME}.out.207\ + ${TNAME}.out.208\ + ${TNAME}.out.209\ + ${TNAME}.out.210\ + ${TNAME}.out.211\ + ${TNAME}.out.212\ + ${TNAME}.out.213\ " for i in $FILES @@ -121,3 +159,4 @@ #============================================================================= rm -f ${TNAME}.testdata* +rm -r ${TNAME}.mgr_testdata* diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.jobs hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.jobs --- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.jobs 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,92 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + + + +#============================================================================= +# ij: Run default case with different solvers +# 1: BoomerAMG_PCG +# 2: DS_PCG +# 3: BoomerAMG_GMRES +# 4: DS_GMRES +# 5: BoomerAMG_CGNR +# 6: DS_CGNR +# 7: PILUT_GMRES +# 8: ParaSails_PCG +# 20: Hybrid_PCG +# + + + +# ij: test systems AMG +# unknown approach +# hybrid approach with block smoother +# nodal approach +# more solvers: +# 51: BoomerAMG_LGMRES +# 50: DS_LGMRES +# 61: BoomerAMG_FlexGMRES +# 60: DS_FlexGMRES +# +#============================================================================= + +mpirun -np 2 ./ij -solver 1 -tol 1.e-16 -rhsrand > solvers_ij.out.0 +mpirun -np 2 ./ij -solver 2 -tol 1.e-16 -rhsrand > solvers_ij.out.1 +mpirun -np 2 ./ij -solver 3 -tol 1.e-16 -rhsrand > solvers_ij.out.2 +mpirun -np 2 ./ij -solver 4 -tol 1.e-16 -rhsrand > solvers_ij.out.3 +mpirun -np 2 ./ij -solver 5 -tol 1.e-16 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4 +mpirun -np 2 ./ij -solver 6 -tol 1.e-16 -rhsrand > solvers_ij.out.5 +mpirun -np 2 ./ij -solver 7 -tol 1.e-16 -rhsrand > solvers_ij.out.6 +mpirun -np 2 ./ij -solver 8 -tol 1.e-16 -rhsrand > solvers_ij.out.7 +mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -rhsrand > solvers_ij.out.8 +mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand > solvers_ij.out.9 +mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10 +mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11 + +#systems AMG run ...unknown approach, hybrid approach, nodal approach +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-16 > solvers_ij.out.sysu +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-16 > solvers_ij.out.sysh +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-16 > solvers_ij.out.sysn + +#LGMRS and FlexGMRES +mpirun -np 2 ./ij -solver 50 -tol 1.e-16 -rhsrand > solvers_ij.out.101 +mpirun -np 2 ./ij -solver 51 -tol 1.e-16 -rhsrand > solvers_ij.out.102 +mpirun -np 2 ./ij -solver 60 -tol 1.e-16 -rhsrand > solvers_ij.out.103 +mpirun -np 2 ./ij -solver 61 -tol 1.e-16 -rhsrand > solvers_ij.out.104 + +#agglomerated coarse grid solve +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-16 > solvers_ij.out.105 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-16 > solvers_ij.out.107 + +#redundant coarse grid solve +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-16 > solvers_ij.out.106 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-16 > solvers_ij.out.108 + +#additive cycles +mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-16 > solvers_ij.out.109 +mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-16 > solvers_ij.out.110 +mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-16 > solvers_ij.out.111 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-16 > solvers_ij.out.112 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-16 > solvers_ij.out.113 + +#nonGalerkin version +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-16 > solvers_ij.out.114 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-16 > solvers_ij.out.115 + +#RAP options +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-16 > solvers_ij.out.116 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-16 > solvers_ij.out.117 diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.saved hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.saved --- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.saved 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.saved 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,145 @@ +# Output file: solvers_ij.out.0 +Iterations = 14 +Final Relative Residual Norm = 7.837531e-18 + +# Output file: solvers_ij.out.1 +Iterations = 65 +Final Relative Residual Norm = 5.127733e-17 + +# Output file: solvers_ij.out.2 +GMRES Iterations = 14 +Final GMRES Relative Residual Norm = 2.081070e-17 + +# Output file: solvers_ij.out.3 +GMRES Iterations = 199 +Final GMRES Relative Residual Norm = 9.121629e-17 + +# Output file: solvers_ij.out.4 +Iterations = 18 +Final Relative Residual Norm = 2.567047e-17 + +# Output file: solvers_ij.out.5 +Iterations = 375 +Final Relative Residual Norm = 9.100420e-17 + +# Output file: solvers_ij.out.6 +GMRES Iterations = 65 +Final GMRES Relative Residual Norm = 9.781966e-17 + +# Output file: solvers_ij.out.7 +Iterations = 42 +Final Relative Residual Norm = 4.868667e-17 + +# Output file: solvers_ij.out.8 +Iterations = 65 +PCG_Iterations = 0 +DSCG_Iterations = 65 +Final Relative Residual Norm = 5.127733e-17 + +# Output file: solvers_ij.out.9 +Iterations = 17 +PCG_Iterations = 13 +DSCG_Iterations = 4 +Final Relative Residual Norm = 2.361559e-17 + +# Output file: solvers_ij.out.10 +Iterations = 16 +PCG_Iterations = 14 +DSCG_Iterations = 2 +Final Relative Residual Norm = 2.081070e-17 + +# Output file: solvers_ij.out.11 +Iterations = 10 +PCG_Iterations = 7 +DSCG_Iterations = 3 +Final Relative Residual Norm = 1.094045e-17 + +# Output file: solvers_ij.out.sysh + Average Convergence Factor = 0.122527 + + Complexity: grid = 1.613750 + operator = 2.860373 + cycle = 5.720578 + +# Output file: solvers_ij.out.sysn + Average Convergence Factor = 0.222219 + + Complexity: grid = 1.592000 + operator = 2.633619 + cycle = 11.267164 + +# Output file: solvers_ij.out.sysu + Average Convergence Factor = 0.430735 + + Complexity: grid = 1.614937 + operator = 2.866488 + cycle = 5.732598 + +# Output file: solvers_ij.out.101 +LGMRES Iterations = 175 +Final LGMRES Relative Residual Norm = 9.264825e-17 + +# Output file: solvers_ij.out.102 +LGMRES Iterations = 17 +Final LGMRES Relative Residual Norm = 2.721454e-17 + +# Output file: solvers_ij.out.103 +FlexGMRES Iterations = 199 +Final FlexGMRES Relative Residual Norm = 9.121063e-17 + +# Output file: solvers_ij.out.104 +FlexGMRES Iterations = 14 +Final FlexGMRES Relative Residual Norm = 2.081457e-17 + +# Output file: solvers_ij.out.105 +Iterations = 27 +Final Relative Residual Norm = 4.195866e-17 + +# Output file: solvers_ij.out.106 +Iterations = 27 +Final Relative Residual Norm = 4.195866e-17 + +# Output file: solvers_ij.out.107 +Iterations = 41 +Final Relative Residual Norm = 5.542245e-17 + +# Output file: solvers_ij.out.108 +Iterations = 41 +Final Relative Residual Norm = 5.542245e-17 + +# Output file: solvers_ij.out.109 +Iterations = 32 +Final Relative Residual Norm = 8.814032e-17 + +# Output file: solvers_ij.out.110 +Iterations = 32 +Final Relative Residual Norm = 8.814032e-17 + +# Output file: solvers_ij.out.111 +Iterations = 56 +Final Relative Residual Norm = 8.514971e-17 + +# Output file: solvers_ij.out.112 +GMRES Iterations = 43 +Final GMRES Relative Residual Norm = 8.418244e-17 + +# Output file: solvers_ij.out.113 +GMRES Iterations = 50 +Final GMRES Relative Residual Norm = 7.078678e-17 + +# Output file: solvers_ij.out.114 +BoomerAMG Iterations = 33 +Final Relative Residual Norm = 3.984250e-17 + +# Output file: solvers_ij.out.115 +BoomerAMG Iterations = 33 +Final Relative Residual Norm = 8.096418e-17 + +# Output file: solvers_ij.out.116 +GMRES Iterations = 18 +Final GMRES Relative Residual Norm = 3.003469e-17 + +# Output file: solvers_ij.out.117 +GMRES Iterations = 18 +Final GMRES Relative Residual Norm = 3.024076e-17 + diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.sh hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.sh --- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,123 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +TNAME=`basename $0 .sh` +CONVTOL=$1 + +# Set default check tolerance +if [ x$CONVTOL = "x" ]; +then + CONVTOL=0.0 +fi +#echo "tol = $CONVTOL" +#============================================================================= +# IJ: Run multiplicative and mult_additive cycle and compare results +# should be the same +#============================================================================= + +tail -17 ${TNAME}.out.109 | head -6 > ${TNAME}.testdata + +#============================================================================= + +tail -17 ${TNAME}.out.110 | head -6 > ${TNAME}.testdata.temp +diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2 + +#============================================================================= +# compare with baseline case +#============================================================================= + +FILES="\ + ${TNAME}.out.0\ + ${TNAME}.out.1\ + ${TNAME}.out.2\ + ${TNAME}.out.3\ + ${TNAME}.out.4\ + ${TNAME}.out.5\ + ${TNAME}.out.6\ + ${TNAME}.out.7\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done > ${TNAME}.out + +FILES="\ + ${TNAME}.out.8\ + ${TNAME}.out.9\ + ${TNAME}.out.10\ + ${TNAME}.out.11\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -5 $i +done >> ${TNAME}.out + +FILES="\ + ${TNAME}.out.sysh\ + ${TNAME}.out.sysn\ + ${TNAME}.out.sysu\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -21 $i | head -6 +done >> ${TNAME}.out + +FILES="\ + ${TNAME}.out.101\ + ${TNAME}.out.102\ + ${TNAME}.out.103\ + ${TNAME}.out.104\ + ${TNAME}.out.105\ + ${TNAME}.out.106\ + ${TNAME}.out.107\ + ${TNAME}.out.108\ + ${TNAME}.out.109\ + ${TNAME}.out.110\ + ${TNAME}.out.111\ + ${TNAME}.out.112\ + ${TNAME}.out.113\ + ${TNAME}.out.114\ + ${TNAME}.out.115\ + ${TNAME}.out.116\ + ${TNAME}.out.117\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done >> ${TNAME}.out + +# Make sure that the output files are reasonable +CHECK_LINE="Complexity" +OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l` +SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l` +if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then + echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2 +fi + +if [ -z $HYPRE_NO_SAVED ]; then + (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2 +fi + +#============================================================================= +# remove temporary files +#============================================================================= + +rm -f ${TNAME}.testdata* diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.jobs hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.jobs --- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.jobs 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,63 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + + + +#============================================================================= +# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid. +#============================================================================= + +mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-16 > solvers_struct.out.0 +mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-16 > solvers_struct.out.1 +mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-16 > solvers_struct.out.2 +mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-16 > solvers_struct.out.3 +mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-16 > solvers_struct.out.4 + + +#============================================================================= +# Run default case with all available PCG preconditioners (solvers): +# 10: SMG (default) +# 11: PFMG +# 17: 2-step Jacobi +# 18: Diagonal scaling +# 19: none +#============================================================================= + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs +mpirun -np 2 ./struct -solver 10 -tol 1.e-16 > solvers_struct.out.10.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 5 > solvers_struct.out.10.lobpcg.5 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs +mpirun -np 2 ./struct -solver 11 -tol 1.e-16 > solvers_struct.out.11.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.11.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 5 > solvers_struct.out.11.lobpcg.5 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs +mpirun -np 2 ./struct -solver 17 -tol 1.e-16 > solvers_struct.out.17.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.17.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 5 > solvers_struct.out.17.lobpcg.5 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs +mpirun -np 2 ./struct -solver 18 -tol 1.e-16 > solvers_struct.out.18.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.18.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 5 > solvers_struct.out.18.lobpcg.5 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs +mpirun -np 2 ./struct -solver 19 1.e-16 > solvers_struct.out.19.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.19.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16 -pcgitr 10 -seed 1 -vrand 5 > solvers_struct.out.19.lobpcg.5 diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.saved hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.saved --- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.saved 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.saved 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,120 @@ +# Output file: solvers_struct.out.0 +Iterations = 10 +Final Relative Residual Norm = 6.581106e-18 + +# Output file: solvers_struct.out.1 +Iterations = 21 +Final Relative Residual Norm = 1.596699e-17 + +# Output file: solvers_struct.out.2 +Iterations = 38 +Final Relative Residual Norm = 7.177049e-17 + +# Output file: solvers_struct.out.3 +Iterations = 34 +Final Relative Residual Norm = 3.265691e-18 + +# Output file: solvers_struct.out.4 +Iterations = 34 +Final Relative Residual Norm = 3.219688e-18 + +# Output file: solvers_struct.out.10.lobpcg +Iterations = 9 +Final Relative Residual Norm = 8.224720e-17 + +# Output file: solvers_struct.out.10.lobpcg.1 +Eigenvalue lambda 1.84366453091756e-01 +Residual 6.51777159501743e-08 + +# Output file: solvers_struct.out.10.lobpcg.5 +Eigenvalue lambda 1.84366453091754e-01 +Eigenvalue lambda 2.50882493969729e-01 +Eigenvalue lambda 3.60090369737174e-01 +Eigenvalue lambda 4.20845334658399e-01 +Eigenvalue lambda 4.20845334658517e-01 +Residual 3.01898591710117e-08 +Residual 4.55422767371506e-08 +Residual 3.17971068849731e-07 +Residual 1.36018481189308e-07 +Residual 3.39522268990133e-07 + +# Output file: solvers_struct.out.11.lobpcg +Iterations = 20 +Final Relative Residual Norm = 4.957736e-17 + +# Output file: solvers_struct.out.11.lobpcg.1 +Eigenvalue lambda 1.84366453091756e-01 +Residual 5.90808378149077e-08 + +# Output file: solvers_struct.out.11.lobpcg.5 +Eigenvalue lambda 1.84366453091753e-01 +Eigenvalue lambda 2.50882493969729e-01 +Eigenvalue lambda 3.60090369737179e-01 +Eigenvalue lambda 4.20845334658427e-01 +Eigenvalue lambda 4.20845334658489e-01 +Residual 3.21720429474309e-08 +Residual 6.44224703495813e-08 +Residual 2.42812454193691e-07 +Residual 3.37502149337777e-07 +Residual 3.28950848883713e-07 + +# Output file: solvers_struct.out.17.lobpcg +Iterations = 34 +Final Relative Residual Norm = 4.087402e-17 + +# Output file: solvers_struct.out.17.lobpcg.1 +Eigenvalue lambda 1.84366453091770e-01 +Residual 1.03034775753859e-07 + +# Output file: solvers_struct.out.17.lobpcg.5 +Eigenvalue lambda 1.84366453091753e-01 +Eigenvalue lambda 2.50882493969729e-01 +Eigenvalue lambda 3.60090369737173e-01 +Eigenvalue lambda 4.20845334658410e-01 +Eigenvalue lambda 4.20845334658418e-01 +Residual 2.01876274958806e-08 +Residual 2.36598865456373e-07 +Residual 1.80211166022567e-07 +Residual 1.64172205059144e-07 +Residual 1.88447274083737e-07 + +# Output file: solvers_struct.out.18.lobpcg +Iterations = 59 +Final Relative Residual Norm = 5.038238e-17 + +# Output file: solvers_struct.out.18.lobpcg.1 +Eigenvalue lambda 1.84366453091755e-01 +Residual 8.10727379998635e-08 + +# Output file: solvers_struct.out.18.lobpcg.5 +Eigenvalue lambda 1.84366453091753e-01 +Eigenvalue lambda 2.50882493969729e-01 +Eigenvalue lambda 3.60090369737172e-01 +Eigenvalue lambda 4.20845334658451e-01 +Eigenvalue lambda 4.20845334658495e-01 +Residual 9.97341733992902e-08 +Residual 1.68228046062857e-07 +Residual 1.17106600356615e-07 +Residual 3.61346668971189e-07 +Residual 3.12170077450817e-07 + +# Output file: solvers_struct.out.19.lobpcg +Iterations = 32 +Final Relative Residual Norm = 8.259590e-07 + +# Output file: solvers_struct.out.19.lobpcg.1 +Eigenvalue lambda 1.84366453091755e-01 +Residual 8.10727379998105e-08 + +# Output file: solvers_struct.out.19.lobpcg.5 +Eigenvalue lambda 1.84366453091753e-01 +Eigenvalue lambda 2.50882493969730e-01 +Eigenvalue lambda 3.60090369737172e-01 +Eigenvalue lambda 4.20845334658451e-01 +Eigenvalue lambda 4.20845334658495e-01 +Residual 9.97341733991334e-08 +Residual 1.68228046056442e-07 +Residual 1.17106600356972e-07 +Residual 3.61346507824777e-07 +Residual 3.12150570425402e-07 + diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.sh hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.sh --- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,79 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + +TNAME=`basename $0 .sh` +CONVTOL=$1 + +# Set default check tolerance +if [ x$CONVTOL = "x" ]; +then + CONVTOL=0.0 +fi +#echo "tol = $CONVTOL" +#============================================================================= +# compare with baseline case +#============================================================================= + +FILES="\ + ${TNAME}.out.0\ + ${TNAME}.out.1\ + ${TNAME}.out.2\ + ${TNAME}.out.3\ + ${TNAME}.out.4\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done > ${TNAME}.out + +FILES="\ + ${TNAME}.out.10.lobpcg\ + ${TNAME}.out.11.lobpcg\ + ${TNAME}.out.17.lobpcg\ + ${TNAME}.out.18.lobpcg\ + ${TNAME}.out.19.lobpcg\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i + echo "# Output file: $i.1" + tail -13 $i.1 | head -3 + echo "# Output file: $i.5" + tail -21 $i.5 | head -11 +done >> ${TNAME}.out + +# Make sure that the output files are reasonable +CHECK_LINE="Iterations" +OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l` +SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l` +if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then + echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2 +fi + +if [ -z $HYPRE_NO_SAVED ]; then + (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2 +fi + +#============================================================================= +# remove temporary files +#============================================================================= + +# rm -f ${TNAME}.testdata* diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.jobs hypre-2.13.0/src/test/TEST_single/solvers_ij.jobs --- hypre-2.11.2/src/test/TEST_single/solvers_ij.jobs 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_ij.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,92 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + + + +#============================================================================= +# ij: Run default case with different solvers +# 1: BoomerAMG_PCG +# 2: DS_PCG +# 3: BoomerAMG_GMRES +# 4: DS_GMRES +# 5: BoomerAMG_CGNR +# 6: DS_CGNR +# 7: PILUT_GMRES +# 8: ParaSails_PCG +# 20: Hybrid_PCG +# + + + +# ij: test systems AMG +# unknown approach +# hybrid approach with block smoother +# nodal approach +# more solvers: +# 51: BoomerAMG_LGMRES +# 50: DS_LGMRES +# 61: BoomerAMG_FlexGMRES +# 60: DS_FlexGMRES +# +#============================================================================= + +mpirun -np 2 ./ij -solver 1 -tol 1.e-4 -rhsrand > solvers_ij.out.0 +mpirun -np 2 ./ij -solver 2 -tol 1.e-4 -rhsrand > solvers_ij.out.1 +mpirun -np 2 ./ij -solver 3 -tol 1.e-4 -rhsrand > solvers_ij.out.2 +mpirun -np 2 ./ij -solver 4 -tol 1.e-4 -rhsrand > solvers_ij.out.3 +mpirun -np 2 ./ij -solver 5 -tol 1.e-4 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4 +mpirun -np 2 ./ij -solver 6 -tol 1.e-4 -rhsrand > solvers_ij.out.5 +mpirun -np 2 ./ij -solver 7 -tol 1.e-4 -rhsrand > solvers_ij.out.6 +mpirun -np 2 ./ij -solver 8 -tol 1.e-4 -rhsrand > solvers_ij.out.7 +mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -rhsrand > solvers_ij.out.8 +mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand > solvers_ij.out.9 +mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10 +mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11 + +#systems AMG run ...unknown approach, hybrid approach, nodal approach +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-4 > solvers_ij.out.sysu +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-4 > solvers_ij.out.sysh +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-4 > solvers_ij.out.sysn + +#LGMRS and FlexGMRES +mpirun -np 2 ./ij -solver 50 -tol 1.e-4 -rhsrand > solvers_ij.out.101 +mpirun -np 2 ./ij -solver 51 -tol 1.e-4 -rhsrand > solvers_ij.out.102 +mpirun -np 2 ./ij -solver 60 -tol 1.e-4 -rhsrand > solvers_ij.out.103 +mpirun -np 2 ./ij -solver 61 -tol 1.e-4 -rhsrand > solvers_ij.out.104 + +#agglomerated coarse grid solve +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-4 > solvers_ij.out.105 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-4 > solvers_ij.out.107 + +#redundant coarse grid solve +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-4 > solvers_ij.out.106 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-4 > solvers_ij.out.108 + +#additive cycles +mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-4 > solvers_ij.out.109 +mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-4 > solvers_ij.out.110 +mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-4 > solvers_ij.out.111 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-4 > solvers_ij.out.112 +mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-4 > solvers_ij.out.113 + +#nonGalerkin version +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-4 > solvers_ij.out.114 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-4 > solvers_ij.out.115 + +#RAP options +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-4 > solvers_ij.out.116 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-4 > solvers_ij.out.117 diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.saved hypre-2.13.0/src/test/TEST_single/solvers_ij.saved --- hypre-2.11.2/src/test/TEST_single/solvers_ij.saved 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_ij.saved 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,145 @@ +# Output file: solvers_ij.out.0 +Iterations = 4 +Final Relative Residual Norm = 1.622019e-05 + +# Output file: solvers_ij.out.1 +Iterations = 26 +Final Relative Residual Norm = 7.193490e-05 + +# Output file: solvers_ij.out.2 +GMRES Iterations = 4 +Final GMRES Relative Residual Norm = 1.566634e-05 + +# Output file: solvers_ij.out.3 +GMRES Iterations = 39 +Final GMRES Relative Residual Norm = 9.040770e-05 + +# Output file: solvers_ij.out.4 +Iterations = 5 +Final Relative Residual Norm = 1.566360e-05 + +# Output file: solvers_ij.out.5 +Iterations = 103 +Final Relative Residual Norm = 8.784404e-05 + +# Output file: solvers_ij.out.6 +GMRES Iterations = 15 +Final GMRES Relative Residual Norm = 7.131740e-05 + +# Output file: solvers_ij.out.7 +Iterations = 13 +Final Relative Residual Norm = 7.750608e-05 + +# Output file: solvers_ij.out.8 +Iterations = 26 +PCG_Iterations = 0 +DSCG_Iterations = 26 +Final Relative Residual Norm = 7.193490e-05 + +# Output file: solvers_ij.out.9 +Iterations = 7 +PCG_Iterations = 3 +DSCG_Iterations = 4 +Final Relative Residual Norm = 6.677458e-05 + +# Output file: solvers_ij.out.10 +Iterations = 6 +PCG_Iterations = 4 +DSCG_Iterations = 2 +Final Relative Residual Norm = 1.566634e-05 + +# Output file: solvers_ij.out.11 +Iterations = 5 +PCG_Iterations = 2 +DSCG_Iterations = 3 +Final Relative Residual Norm = 9.148609e-06 + +# Output file: solvers_ij.out.sysh + Average Convergence Factor = 0.122042 + + Complexity: grid = 1.613750 + operator = 2.860298 + cycle = 5.720429 + +# Output file: solvers_ij.out.sysn + Average Convergence Factor = 0.241065 + + Complexity: grid = 1.592000 + operator = 2.633619 + cycle = 11.267164 + +# Output file: solvers_ij.out.sysu + Average Convergence Factor = 0.413666 + + Complexity: grid = 1.614812 + operator = 2.865126 + cycle = 5.730247 + +# Output file: solvers_ij.out.101 +LGMRES Iterations = 39 +Final LGMRES Relative Residual Norm = 7.223550e-05 + +# Output file: solvers_ij.out.102 +LGMRES Iterations = 4 +Final LGMRES Relative Residual Norm = 1.566634e-05 + +# Output file: solvers_ij.out.103 +FlexGMRES Iterations = 39 +Final FlexGMRES Relative Residual Norm = 9.040783e-05 + +# Output file: solvers_ij.out.104 +FlexGMRES Iterations = 4 +Final FlexGMRES Relative Residual Norm = 1.567694e-05 + +# Output file: solvers_ij.out.105 +Iterations = 9 +Final Relative Residual Norm = 2.436101e-05 + +# Output file: solvers_ij.out.106 +Iterations = 9 +Final Relative Residual Norm = 2.436101e-05 + +# Output file: solvers_ij.out.107 +Iterations = 12 +Final Relative Residual Norm = 6.371664e-05 + +# Output file: solvers_ij.out.108 +Iterations = 12 +Final Relative Residual Norm = 6.371664e-05 + +# Output file: solvers_ij.out.109 +Iterations = 9 +Final Relative Residual Norm = 8.952725e-05 + +# Output file: solvers_ij.out.110 +Iterations = 9 +Final Relative Residual Norm = 8.952696e-05 + +# Output file: solvers_ij.out.111 +Iterations = 15 +Final Relative Residual Norm = 8.953455e-05 + +# Output file: solvers_ij.out.112 +GMRES Iterations = 12 +Final GMRES Relative Residual Norm = 5.915735e-05 + +# Output file: solvers_ij.out.113 +GMRES Iterations = 13 +Final GMRES Relative Residual Norm = 7.913169e-05 + +# Output file: solvers_ij.out.114 +BoomerAMG Iterations = 9 +Final Relative Residual Norm = 3.746825e-05 + +# Output file: solvers_ij.out.115 +BoomerAMG Iterations = 9 +Final Relative Residual Norm = 3.886564e-05 + +# Output file: solvers_ij.out.116 +GMRES Iterations = 6 +Final GMRES Relative Residual Norm = 3.062358e-05 + +# Output file: solvers_ij.out.117 +GMRES Iterations = 6 +Final GMRES Relative Residual Norm = 3.026567e-05 + diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.sh hypre-2.13.0/src/test/TEST_single/solvers_ij.sh --- hypre-2.11.2/src/test/TEST_single/solvers_ij.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_ij.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,124 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + +TNAME=`basename $0 .sh` +CONVTOL=$1 + +# Set default check tolerance +if [ x$CONVTOL = "x" ]; +then + CONVTOL=0.0 +fi +#echo "tol = $CONVTOL" +#============================================================================= +# IJ: Run multiplicative and mult_additive cycle and compare results +# should be the same +#============================================================================= + +tail -17 ${TNAME}.out.109 | head -3 > ${TNAME}.testdata + +#============================================================================= + +tail -17 ${TNAME}.out.110 | head -3 > ${TNAME}.testdata.temp +#diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2 +../runcheck.sh ${TNAME}.testdata ${TNAME}.testdata.temp 1.e-4 >&2 + +#============================================================================= +# compare with baseline case +#============================================================================= + +FILES="\ + ${TNAME}.out.0\ + ${TNAME}.out.1\ + ${TNAME}.out.2\ + ${TNAME}.out.3\ + ${TNAME}.out.4\ + ${TNAME}.out.5\ + ${TNAME}.out.6\ + ${TNAME}.out.7\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done > ${TNAME}.out + +FILES="\ + ${TNAME}.out.8\ + ${TNAME}.out.9\ + ${TNAME}.out.10\ + ${TNAME}.out.11\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -5 $i +done >> ${TNAME}.out + +FILES="\ + ${TNAME}.out.sysh\ + ${TNAME}.out.sysn\ + ${TNAME}.out.sysu\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -21 $i | head -6 +done >> ${TNAME}.out + +FILES="\ + ${TNAME}.out.101\ + ${TNAME}.out.102\ + ${TNAME}.out.103\ + ${TNAME}.out.104\ + ${TNAME}.out.105\ + ${TNAME}.out.106\ + ${TNAME}.out.107\ + ${TNAME}.out.108\ + ${TNAME}.out.109\ + ${TNAME}.out.110\ + ${TNAME}.out.111\ + ${TNAME}.out.112\ + ${TNAME}.out.113\ + ${TNAME}.out.114\ + ${TNAME}.out.115\ + ${TNAME}.out.116\ + ${TNAME}.out.117\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done >> ${TNAME}.out + +# Make sure that the output files are reasonable +CHECK_LINE="Complexity" +OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l` +SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l` +if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then + echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2 +fi + +if [ -z $HYPRE_NO_SAVED ]; then + (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2 +fi + +#============================================================================= +# remove temporary files +#============================================================================= + +rm -f ${TNAME}.testdata* diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.jobs hypre-2.13.0/src/test/TEST_single/solvers_struct.jobs --- hypre-2.11.2/src/test/TEST_single/solvers_struct.jobs 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_struct.jobs 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,63 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + + + +#============================================================================= +# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid. +#============================================================================= + +mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-4 > solvers_struct.out.0 +mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-4 > solvers_struct.out.1 +mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-4 > solvers_struct.out.2 +mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-4 > solvers_struct.out.3 +mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-4 > solvers_struct.out.4 + + +#============================================================================= +# Run default case with all available PCG preconditioners (solvers): +# 10: SMG (default) +# 11: PFMG +# 17: 2-step Jacobi +# 18: Diagonal scaling +# 19: none +#============================================================================= + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs +mpirun -np 2 ./struct -solver 10 -tol 1.e-4 > solvers_struct.out.10.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 3 > solvers_struct.out.10.lobpcg.3 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs +mpirun -np 2 ./struct -solver 11 -tol 1.e-4 > solvers_struct.out.11.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.11.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 3 > solvers_struct.out.11.lobpcg.3 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs +mpirun -np 2 ./struct -solver 17 > solvers_struct.out.17.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.17.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 3 > solvers_struct.out.17.lobpcg.3 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs +mpirun -np 2 ./struct -solver 18 > solvers_struct.out.18.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.18.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 3 > solvers_struct.out.18.lobpcg.3 + +# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs +mpirun -np 2 ./struct -solver 19 -tol 1.e-4 > solvers_struct.out.19.lobpcg +mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 1 > solvers_struct.out.19.lobpcg.1 +mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4 -pcgitr 10 -seed 1 -vrand 3 > solvers_struct.out.19.lobpcg.3 diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.saved hypre-2.13.0/src/test/TEST_single/solvers_struct.saved --- hypre-2.11.2/src/test/TEST_single/solvers_struct.saved 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_struct.saved 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,120 @@ +# Output file: solvers_struct.out.0 +Iterations = 3 +Final Relative Residual Norm = 3.246673e-05 + +# Output file: solvers_struct.out.1 +Iterations = 6 +Final Relative Residual Norm = 2.055852e-05 + +# Output file: solvers_struct.out.2 +Iterations = 16 +Final Relative Residual Norm = 5.377689e-05 + +# Output file: solvers_struct.out.3 +Iterations = 16 +Final Relative Residual Norm = 3.726248e-05 + +# Output file: solvers_struct.out.4 +Iterations = 16 +Final Relative Residual Norm = 3.718718e-05 + +# Output file: solvers_struct.out.10.lobpcg +Iterations = 3 +Final Relative Residual Norm = 6.275783e-06 + +# Output file: solvers_struct.out.10.lobpcg.1 +Eigenvalue lambda 1.84366211295128e-01 +Residual 2.48430933424970e-05 + +# Output file: solvers_struct.out.10.lobpcg.3 +Iteration 10 bsize 2 maxres 4.35155990999192e-04 +Iteration 11 bsize 1 maxres 2.05302669201046e-04 +Iteration 12 bsize 1 maxres 8.52039884193800e-05 + +Eigenvalue lambda 1.84366419911385e-01 +Eigenvalue lambda 2.50882804393768e-01 +Eigenvalue lambda 3.60089868307114e-01 +Residual 7.40827381378040e-05 +Residual 4.10445172747131e-05 +Residual 8.52039884193800e-05 + +# Output file: solvers_struct.out.11.lobpcg +Iterations = 6 +Final Relative Residual Norm = 2.112822e-05 + +# Output file: solvers_struct.out.11.lobpcg.1 +Eigenvalue lambda 1.84366270899773e-01 +Residual 3.14826756948605e-05 + +# Output file: solvers_struct.out.11.lobpcg.3 +Iteration 11 bsize 2 maxres 6.89935637637973e-04 +Iteration 12 bsize 2 maxres 2.52041267231107e-04 +Iteration 13 bsize 1 maxres 7.04026824678294e-05 + +Eigenvalue lambda 1.84366539120674e-01 +Eigenvalue lambda 2.50883042812347e-01 +Eigenvalue lambda 3.60092163085938e-01 +Residual 5.59787331440020e-05 +Residual 2.58176805800758e-05 +Residual 7.04026824678294e-05 + +# Output file: solvers_struct.out.17.lobpcg +Iterations = 20 +Final Relative Residual Norm = 4.194806e-07 + +# Output file: solvers_struct.out.17.lobpcg.1 +Eigenvalue lambda 1.84366390109062e-01 +Residual 1.95900083781453e-05 + +# Output file: solvers_struct.out.17.lobpcg.3 +Iteration 10 bsize 2 maxres 3.62457707524300e-04 +Iteration 11 bsize 1 maxres 1.69860562891699e-04 +Iteration 12 bsize 1 maxres 7.12833571014926e-05 + +Eigenvalue lambda 1.84366583824158e-01 +Eigenvalue lambda 2.50883996486664e-01 +Eigenvalue lambda 3.60090583562851e-01 +Residual 5.53683385078330e-05 +Residual 3.08582348225173e-05 +Residual 7.12833571014926e-05 + +# Output file: solvers_struct.out.18.lobpcg +Iterations = 33 +Final Relative Residual Norm = 8.027236e-07 + +# Output file: solvers_struct.out.18.lobpcg.1 +Eigenvalue lambda 1.84366077184677e-01 +Residual 4.44860852439888e-05 + +# Output file: solvers_struct.out.18.lobpcg.3 +Iteration 10 bsize 2 maxres 5.79534214921296e-04 +Iteration 11 bsize 1 maxres 1.98537352844141e-04 +Iteration 12 bsize 1 maxres 9.26745269680396e-05 + +Eigenvalue lambda 1.84366509318352e-01 +Eigenvalue lambda 2.50894546508789e-01 +Eigenvalue lambda 3.60091388225555e-01 +Residual 9.26745269680396e-05 +Residual 8.93285541678779e-05 +Residual 5.54441285203211e-05 + +# Output file: solvers_struct.out.19.lobpcg +Iterations = 25 +Final Relative Residual Norm = 7.712499e-05 + +# Output file: solvers_struct.out.19.lobpcg.1 +Eigenvalue lambda 1.84366524219513e-01 +Residual 4.43533899670001e-05 + +# Output file: solvers_struct.out.19.lobpcg.3 +Iteration 10 bsize 2 maxres 5.82181091886014e-04 +Iteration 11 bsize 1 maxres 1.99127141968347e-04 +Iteration 12 bsize 1 maxres 9.26545544643886e-05 + +Eigenvalue lambda 1.84366345405579e-01 +Eigenvalue lambda 2.50889092683792e-01 +Eigenvalue lambda 3.60089659690857e-01 +Residual 9.26545544643886e-05 +Residual 8.75688710948452e-05 +Residual 5.51430639461614e-05 + diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.sh hypre-2.13.0/src/test/TEST_single/solvers_struct.sh --- hypre-2.11.2/src/test/TEST_single/solvers_struct.sh 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_single/solvers_struct.sh 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,79 @@ +#!/bin/sh +#BHEADER********************************************************************** +# Copyright (c) 2008, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# This file is part of HYPRE. See file COPYRIGHT for details. +# +# HYPRE is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License (as published by the Free +# Software Foundation) version 2.1 dated February 1999. +# +# $Revision$ +#EHEADER********************************************************************** + + + + + +TNAME=`basename $0 .sh` +CONVTOL=$1 + +# Set default check tolerance +if [ x$CONVTOL = "x" ]; +then + CONVTOL=0.0 +fi +#echo "tol = $CONVTOL" +#============================================================================= +# compare with baseline case +#============================================================================= + +FILES="\ + ${TNAME}.out.0\ + ${TNAME}.out.1\ + ${TNAME}.out.2\ + ${TNAME}.out.3\ + ${TNAME}.out.4\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i +done > ${TNAME}.out + +FILES="\ + ${TNAME}.out.10.lobpcg\ + ${TNAME}.out.11.lobpcg\ + ${TNAME}.out.17.lobpcg\ + ${TNAME}.out.18.lobpcg\ + ${TNAME}.out.19.lobpcg\ +" + +for i in $FILES +do + echo "# Output file: $i" + tail -3 $i + echo "# Output file: $i.1" + tail -13 $i.1 | head -3 + echo "# Output file: $i.3" + tail -21 $i.3 | head -11 +done >> ${TNAME}.out + +# Make sure that the output files are reasonable +CHECK_LINE="Iterations" +OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l` +SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l` +if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then + echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2 +fi + +if [ -z $HYPRE_NO_SAVED ]; then + (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2 +fi + +#============================================================================= +# remove temporary files +#============================================================================= + +# rm -f ${TNAME}.testdata* diff -Nru hypre-2.11.2/src/test/TEST_sstruct/solvers.saved hypre-2.13.0/src/test/TEST_sstruct/solvers.saved --- hypre-2.11.2/src/test/TEST_sstruct/solvers.saved 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_sstruct/solvers.saved 2017-10-20 17:42:22.000000000 +0000 @@ -87,70 +87,70 @@ Final Relative Residual Norm = 6.429522e-07 # Output file: solvers.out.10.lobpcg.1 -Eigenvalue lambda 1.34883860790786e+00 -Residual 2.40758202827198e-03 +Eigenvalue lambda 1.34880853089918e+00 +Residual 1.40515754419491e-04 # Output file: solvers.out.10.lobpcg.4 -Eigenvalue lambda 1.34880848246868e+00 -Eigenvalue lambda 1.36560847743331e+00 -Eigenvalue lambda 1.47908699641224e+00 -Eigenvalue lambda 1.49589925871990e+00 -Residual 2.16895015738541e-06 -Residual 2.26924940977357e-06 -Residual 2.30461087038239e-06 -Residual 2.31349855264711e-06 +Eigenvalue lambda 1.34880848246852e+00 +Eigenvalue lambda 1.36560847743334e+00 +Eigenvalue lambda 1.47908699641636e+00 +Eigenvalue lambda 1.49589925873578e+00 +Residual 1.88411068757809e-06 +Residual 1.92352730596892e-06 +Residual 2.62402461132032e-06 +Residual 5.33772292781341e-06 # Output file: solvers.out.11.lobpcg Iterations = 24 Final Relative Residual Norm = 6.654613e-07 # Output file: solvers.out.11.lobpcg.1 -Eigenvalue lambda 1.34884333064988e+00 -Residual 2.59663483715351e-03 +Eigenvalue lambda 1.34880853049745e+00 +Residual 1.27609473406123e-04 # Output file: solvers.out.11.lobpcg.4 -Eigenvalue lambda 1.34880848246851e+00 -Eigenvalue lambda 1.36560847743324e+00 -Eigenvalue lambda 1.47908699641209e+00 -Eigenvalue lambda 1.49589925871908e+00 -Residual 1.80601168111517e-06 -Residual 2.08224805414380e-06 -Residual 2.43864085668235e-06 -Residual 2.07089097060174e-06 +Eigenvalue lambda 1.34880848246853e+00 +Eigenvalue lambda 1.36560847743866e+00 +Eigenvalue lambda 1.47908699650267e+00 +Eigenvalue lambda 1.49589925889085e+00 +Residual 2.34138605050087e-06 +Residual 2.11050112570845e-06 +Residual 2.34273161452232e-06 +Residual 4.55133814558691e-06 # Output file: solvers.out.18.lobpcg Iterations = 25 Final Relative Residual Norm = 9.124482e-07 # Output file: solvers.out.18.lobpcg.1 -Eigenvalue lambda 1.34880848247360e+00 -Residual 1.27710890197410e-06 +Eigenvalue lambda 1.34880848247421e+00 +Residual 1.68760664468992e-06 # Output file: solvers.out.18.lobpcg.4 -Eigenvalue lambda 1.34880848246838e+00 -Eigenvalue lambda 1.36560847743320e+00 -Eigenvalue lambda 1.47908699641198e+00 -Eigenvalue lambda 1.49589925871866e+00 -Residual 1.35084619640744e-06 -Residual 1.98965928996447e-06 -Residual 1.13053053095074e-06 -Residual 2.44142594272676e-06 +Eigenvalue lambda 1.34880848246833e+00 +Eigenvalue lambda 1.36560847743301e+00 +Eigenvalue lambda 1.47908699641137e+00 +Eigenvalue lambda 1.49589925871590e+00 +Residual 7.29467860779249e-07 +Residual 9.62135023675930e-07 +Residual 1.05238502112812e-06 +Residual 1.42836272383011e-06 # Output file: solvers.out.19.lobpcg Iterations = 25 Final Relative Residual Norm = 9.124482e-07 # Output file: solvers.out.19.lobpcg.1 -Eigenvalue lambda 1.34880848247360e+00 -Residual 1.27710890239225e-06 +Eigenvalue lambda 1.34880848247421e+00 +Residual 1.68760664640186e-06 # Output file: solvers.out.19.lobpcg.4 -Eigenvalue lambda 1.34880848246840e+00 -Eigenvalue lambda 1.36560847743319e+00 -Eigenvalue lambda 1.47908699641203e+00 -Eigenvalue lambda 1.49589925871871e+00 -Residual 1.35084619677939e-06 -Residual 1.98965929201569e-06 -Residual 1.13053053130770e-06 -Residual 2.44142594408303e-06 +Eigenvalue lambda 1.34880848246915e+00 +Eigenvalue lambda 1.36560847743475e+00 +Eigenvalue lambda 1.47908699641975e+00 +Eigenvalue lambda 1.49589925875392e+00 +Residual 7.29467866482847e-07 +Residual 9.62135024900930e-07 +Residual 1.05238500241244e-06 +Residual 1.42836275555007e-06 diff -Nru hypre-2.11.2/src/test/TEST_struct/solvers.saved hypre-2.13.0/src/test/TEST_struct/solvers.saved --- hypre-2.11.2/src/test/TEST_struct/solvers.saved 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/TEST_struct/solvers.saved 2017-10-20 17:42:22.000000000 +0000 @@ -23,98 +23,98 @@ Final Relative Residual Norm = 1.355288e-07 # Output file: solvers.out.10.lobpcg.1 -Eigenvalue lambda 1.84366453092285e-01 -Residual 9.48102341087739e-07 +Eigenvalue lambda 1.84366453091860e-01 +Residual 4.52048636947317e-07 # Output file: solvers.out.10.lobpcg.5 -Eigenvalue lambda 1.84366453091761e-01 -Eigenvalue lambda 2.50882493969895e-01 -Eigenvalue lambda 3.60090369737266e-01 -Eigenvalue lambda 4.20845334658830e-01 -Eigenvalue lambda 4.20845334659278e-01 -Residual 3.22224736401280e-07 -Residual 4.14402445882080e-07 -Residual 9.42143466466521e-07 -Residual 7.77020993206880e-07 -Residual 7.94159301934379e-07 +Eigenvalue lambda 1.84366453091770e-01 +Eigenvalue lambda 2.50882493969728e-01 +Eigenvalue lambda 3.60090369737200e-01 +Eigenvalue lambda 4.20845334658571e-01 +Eigenvalue lambda 4.20845334658890e-01 +Residual 3.35930189890736e-07 +Residual 3.11260814535858e-07 +Residual 7.33053164875689e-07 +Residual 5.14259223167152e-07 +Residual 7.32161520953280e-07 # Output file: solvers.out.11.lobpcg Iterations = 8 Final Relative Residual Norm = 4.807900e-07 # Output file: solvers.out.11.lobpcg.1 -Eigenvalue lambda 1.84366453091815e-01 -Residual 3.85721107269604e-07 +Eigenvalue lambda 1.84366453092320e-01 +Residual 1.05361158868911e-06 # Output file: solvers.out.11.lobpcg.5 -Eigenvalue lambda 1.84366453091761e-01 -Eigenvalue lambda 2.50882493969760e-01 -Eigenvalue lambda 3.60090369737228e-01 -Eigenvalue lambda 4.20845334658737e-01 -Eigenvalue lambda 4.20845334659947e-01 -Residual 2.05382831978917e-07 -Residual 4.59866546803063e-07 -Residual 9.90818911866409e-07 -Residual 1.33480975037863e-06 -Residual 1.29716729646718e-06 +Eigenvalue lambda 1.84366453091757e-01 +Eigenvalue lambda 2.50882493969849e-01 +Eigenvalue lambda 3.60090369737247e-01 +Eigenvalue lambda 4.20845334660032e-01 +Eigenvalue lambda 4.20845334660337e-01 +Residual 1.94648827543750e-07 +Residual 1.22160192258141e-06 +Residual 7.71834588037176e-07 +Residual 1.28057049961329e-06 +Residual 1.26100802396762e-06 # Output file: solvers.out.17.lobpcg Iterations = 17 Final Relative Residual Norm = 8.241147e-07 # Output file: solvers.out.17.lobpcg.1 -Eigenvalue lambda 1.84366453091981e-01 -Residual 6.58030973127707e-07 +Eigenvalue lambda 1.84366453091822e-01 +Residual 2.74559569275018e-07 # Output file: solvers.out.17.lobpcg.5 -Eigenvalue lambda 1.84366453091754e-01 -Eigenvalue lambda 2.50882493969533e-01 -Eigenvalue lambda 3.60090369737259e-01 -Eigenvalue lambda 4.20845334658560e-01 -Eigenvalue lambda 4.20845334659094e-01 -Residual 2.18600906457648e-07 -Residual 2.01409973718392e-07 -Residual 7.10622693277451e-07 -Residual 8.92153133859236e-07 -Residual 9.10269958132537e-07 +Eigenvalue lambda 1.84366453091761e-01 +Eigenvalue lambda 2.50882493969758e-01 +Eigenvalue lambda 3.60090369737186e-01 +Eigenvalue lambda 4.20845334658611e-01 +Eigenvalue lambda 4.20845334658816e-01 +Residual 2.41984718651313e-07 +Residual 1.16310823945663e-06 +Residual 6.95498018578113e-07 +Residual 7.11395856759138e-07 +Residual 7.20734654889730e-07 # Output file: solvers.out.18.lobpcg Iterations = 32 Final Relative Residual Norm = 8.259590e-07 # Output file: solvers.out.18.lobpcg.1 -Eigenvalue lambda 1.84366453091822e-01 -Residual 2.06395470109285e-07 +Eigenvalue lambda 1.84366453091920e-01 +Residual 4.04811751101852e-07 # Output file: solvers.out.18.lobpcg.5 -Eigenvalue lambda 1.84366453091753e-01 -Eigenvalue lambda 2.50882493969752e-01 -Eigenvalue lambda 3.60090369737251e-01 -Eigenvalue lambda 4.20845334658871e-01 -Eigenvalue lambda 4.20845334659104e-01 -Residual 1.23291603592449e-07 -Residual 4.17547991069515e-07 -Residual 1.17426104064181e-06 -Residual 1.09268026031814e-06 -Residual 9.56432798770891e-07 +Eigenvalue lambda 1.84366453091760e-01 +Eigenvalue lambda 2.50882493969440e-01 +Eigenvalue lambda 3.60090369737180e-01 +Eigenvalue lambda 4.20845334658791e-01 +Eigenvalue lambda 4.20845334659400e-01 +Residual 6.51620151548372e-07 +Residual 1.04113397676966e-06 +Residual 1.05795257133655e-06 +Residual 1.01625954175797e-06 +Residual 1.09288008877697e-06 # Output file: solvers.out.19.lobpcg Iterations = 32 Final Relative Residual Norm = 8.259590e-07 # Output file: solvers.out.19.lobpcg.1 -Eigenvalue lambda 1.84366453091822e-01 -Residual 2.06395470089861e-07 +Eigenvalue lambda 1.84366453091920e-01 +Residual 4.04811751079993e-07 # Output file: solvers.out.19.lobpcg.5 -Eigenvalue lambda 1.84366453091755e-01 -Eigenvalue lambda 2.50882493969748e-01 -Eigenvalue lambda 3.60090369737247e-01 -Eigenvalue lambda 4.20845334658872e-01 -Eigenvalue lambda 4.20845334659103e-01 -Residual 1.23291603639448e-07 -Residual 4.17547991296164e-07 -Residual 1.17426103999361e-06 -Residual 1.09271838882886e-06 -Residual 9.55993938911220e-07 +Eigenvalue lambda 1.84366453091760e-01 +Eigenvalue lambda 2.50882493969706e-01 +Eigenvalue lambda 3.60090369737178e-01 +Eigenvalue lambda 4.20845334658819e-01 +Eigenvalue lambda 4.20845334659484e-01 +Residual 6.51620151941224e-07 +Residual 1.04113396974827e-06 +Residual 1.05795252287838e-06 +Residual 1.01635836602686e-06 +Residual 1.09013903468312e-06 diff -Nru hypre-2.11.2/src/test/zboxloop.c hypre-2.13.0/src/test/zboxloop.c --- hypre-2.11.2/src/test/zboxloop.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/test/zboxloop.c 2017-10-20 17:42:22.000000000 +0000 @@ -38,7 +38,8 @@ HYPRE_Int rep, reps, fail, sum; HYPRE_Int size; hypre_Box *x1_data_box, *x2_data_box, *x3_data_box, *x4_data_box; - HYPRE_Int xi1, xi2, xi3, xi4; + //HYPRE_Int xi1, xi2, xi3, xi4; + HYPRE_Int xi1; HYPRE_Real *xp1, *xp2, *xp3, *xp4; hypre_Index loop_size, start, unit_stride, index; @@ -192,7 +193,7 @@ zypre_BoxLoop1Begin(dim, loop_size, x1_data_box, start, unit_stride, xi1); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif zypre_BoxLoop1For(xi1) { @@ -203,9 +204,8 @@ /* Use old boxloop to check that values are set to 1 */ fail = 0; sum = 0; - hypre_BoxLoop1Begin(3, loop_size, - x1_data_box, start, unit_stride, xi1); - hypre_BoxLoop1For(xi1) + hypre_SerialBoxLoop1Begin(3, loop_size, + x1_data_box, start, unit_stride, xi1); { sum += xp1[xi1]; if (xp1[xi1] != 1) @@ -216,7 +216,7 @@ fail = 1; } } - hypre_BoxLoop1End(xi1); + hypre_SerialBoxLoop1End(xi1); if (sum != (nx*ny*nz)) { @@ -251,7 +251,7 @@ hypre_BoxLoop0For() { xp1[xi1] += xp1[xi1]; - xi1++; + //xi1++; } hypre_BoxLoop0End(); } @@ -265,7 +265,7 @@ hypre_BoxLoop1Begin(3, loop_size, x1_data_box, start, unit_stride, xi1); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop1For(xi1) { @@ -284,7 +284,7 @@ x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop2For(xi1, xi2) { @@ -304,7 +304,7 @@ x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop3For(xi1, xi2, xi3) { @@ -325,7 +325,7 @@ x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif hypre_BoxLoop4For(xi1, xi2, xi3, xi4) { @@ -370,7 +370,7 @@ zypre_BoxLoop1Begin(dim, loop_size, x1_data_box, start, unit_stride, xi1); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif zypre_BoxLoop1For(xi1) { @@ -389,7 +389,7 @@ x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif zypre_BoxLoop2For(xi1, xi2) { @@ -409,7 +409,7 @@ x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif zypre_BoxLoop3For(xi1, xi2, xi3) { @@ -430,7 +430,7 @@ x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); #ifdef HYPRE_USING_OPENMP -#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE +#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE #endif zypre_BoxLoop4For(xi1, xi2, xi3, xi4) { diff -Nru hypre-2.11.2/src/utilities/amg_linklist.h hypre-2.13.0/src/utilities/amg_linklist.h --- hypre-2.11.2/src/utilities/amg_linklist.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/amg_linklist.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,8 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - /****************************************************************************** * * Header file link lists @@ -46,3 +44,4 @@ #endif #endif + diff -Nru hypre-2.11.2/src/utilities/caliper_instrumentation.h hypre-2.13.0/src/utilities/caliper_instrumentation.h --- hypre-2.11.2/src/utilities/caliper_instrumentation.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/caliper_instrumentation.h 2017-10-20 17:42:22.000000000 +0000 @@ -36,3 +36,4 @@ #endif #endif /* CALIPER_INSTRUMENTATION_HEADER */ + diff -Nru hypre-2.11.2/src/utilities/exchange_data.h hypre-2.13.0/src/utilities/exchange_data.h --- hypre-2.11.2/src/utilities/exchange_data.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/exchange_data.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - #ifndef hypre_EXCHANGE_DATA_HEADER #define hypre_EXCHANGE_DATA_HEADER @@ -19,7 +18,6 @@ #define hypre_BinaryTreeChildIds(tree) (tree->child_id) #define hypre_BinaryTreeChildId(tree, i) (tree->child_id[i]) - typedef struct { HYPRE_Int parent_id; @@ -27,8 +25,6 @@ HYPRE_Int *child_id; } hypre_BinaryTree; - - /* In the fill_response() function the user needs to set the recv__buf and the response_message_size. Memory of size send_response_storage has been alllocated for the send_buf (in exchange_data) - if more is needed, then @@ -37,7 +33,6 @@ If the response is an empty "confirmation" message, then set response_message_size =0 (and do not modify the send_buf) */ - typedef struct { HYPRE_Int (*fill_response)(void* recv_buf, HYPRE_Int contact_size, @@ -51,11 +46,9 @@ } hypre_DataExchangeResponse; - HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*); HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*); - HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, HYPRE_Int *contact_proc_list, void *contact_send_buf, HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, @@ -64,5 +57,5 @@ HYPRE_Int rnum, MPI_Comm comm, void **p_response_recv_buf, HYPRE_Int **p_response_recv_buf_starts); - #endif /* end of header */ + diff -Nru hypre-2.11.2/src/utilities/general.h hypre-2.13.0/src/utilities/general.h --- hypre-2.11.2/src/utilities/general.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/general.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,7 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - /****************************************************************************** * * General structures and values @@ -20,6 +19,15 @@ #ifndef hypre_GENERAL_HEADER #define hypre_GENERAL_HEADER +/* This allows us to consistently avoid 'int' throughout hypre */ +typedef int hypre_int; +typedef long int hypre_longint; +typedef unsigned int hypre_uint; +typedef unsigned long int hypre_ulongint; + +/* This allows us to consistently avoid 'double' throughout hypre */ +typedef double hypre_double; + /*-------------------------------------------------------------------------- * Define various functions *--------------------------------------------------------------------------*/ @@ -44,3 +52,4 @@ #endif #endif + diff -Nru hypre-2.11.2/src/utilities/gpgpu.h hypre-2.13.0/src/utilities/gpgpu.h --- hypre-2.11.2/src/utilities/gpgpu.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/gpgpu.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,19 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal +//#define CUDAMEMATTACHTYPE cudaMemAttachHost +#define HYPRE_GPU_USE_PINNED 1 +#define HYPRE_USE_MANAGED_SCALABLE 1 +#endif + diff -Nru hypre-2.11.2/src/utilities/gpuErrorCheck.c hypre-2.13.0/src/utilities/gpuErrorCheck.c --- hypre-2.11.2/src/utilities/gpuErrorCheck.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/gpuErrorCheck.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,111 @@ + +#include "_hypre_utilities.h" + +#if defined(HYPRE_USE_GPU) || defined(HYPRE_USE_MANAGED) +#include +#ifdef HYPRE_USE_GPU +extern const char *cusparseErrorCheck(cusparseStatus_t error); +extern void gpuAssert(cudaError_t code, const char *file, int line); +extern void cusparseAssert(cusparseStatus_t code, const char *file, int line); +#endif + +/* + cudaSafeFree frees Managed memory allocated in hypre_MAlloc,hypre_CAlloc and hypre_ReAlloc + It checks if the memory is managed before freeing and emits a warning if it is not memory + allocated using the above routines. This behaviour can be changed by defining ABORT_ON_RAW_POINTER. + The core file can then be used to find the location of the anomalous hypre_Free. + */ +void cudaSafeFree(void *ptr,int padding) +{ + PUSH_RANGE("SAFE_FREE",3); + struct cudaPointerAttributes ptr_att; + size_t *sptr=(size_t*)ptr-padding; + cudaError_t err; + + err=cudaPointerGetAttributes(&ptr_att,ptr); + if (err!=cudaSuccess){ + cudaGetLastError(); +#define FULL_WARN +#ifndef ABORT_ON_RAW_POINTER +#ifdef FULL_WARN + if (err==cudaErrorInvalidValue) fprintf(stderr,"WARNING :: Raw pointer passed to cudaSafeFree %p\n",ptr); + if (err==cudaErrorInvalidDevice) fprintf(stderr,"WARNING :: cudaSafeFree :: INVALID DEVICE on ptr = %p\n",ptr); + //PrintPointerAttributes(ptr); +#endif +#else + fprintf(stderr,"ERROR:: cudaSafeFree Aborting on raw unmanaged pointer %p\n",ptr); + raise(SIGABRT); +#endif + free(ptr); /* Free the nonManaged pointer */ + return; + } + if (ptr_att.isManaged){ +#if defined(HYPRE_USE_GPU) && defined(HYPRE_MEASURE_GPU_HWM) + size_t mfree,mtotal; + gpuErrchk(cudaMemGetInfo(&mfree,&mtotal)); + HYPRE_GPU_HWM=hypre_max((mtotal-mfree),HYPRE_GPU_HWM); +#endif + /* Code below for handling managed memory pointers not allocated using hypre_CTAlloc oir hypre_TAlooc */ + if (PointerAttributes(ptr)!=PointerAttributes(sptr)){ + //fprintf(stderr,"ERROR IN Pointer for freeing %p %p\n",ptr,sptr); + gpuErrchk(cudaFree(ptr)); + return; + } + gpuErrchk(cudaFree(sptr)); + } else { + /* It is a pinned memory pointer */ + //printf("ERROR:: NON-managed pointer passed to cudaSafeFree\n"); + if (ptr_att.memoryType==cudaMemoryTypeHost){ + gpuErrchk(cudaFreeHost(sptr)); + } else if (ptr_att.memoryType==cudaMemoryTypeDevice){ + gpuErrchk(cudaFree(sptr)); + } + } + POP_RANGE; + return; +} +hypre_int PrintPointerAttributes(const void *ptr){ + struct cudaPointerAttributes ptr_att; + if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){ + cudaGetLastError(); + fprintf(stderr,"PrintPointerAttributes:: Raw pointer %p\n",ptr); + return HYPRE_HOST_POINTER; + } + if (ptr_att.isManaged){ + fprintf(stderr,"PrintPointerAttributes:: Managed pointer\n"); + fprintf(stderr,"Host address = %p, Device Address = %p\n",ptr_att.hostPointer, ptr_att.devicePointer); + if (ptr_att.memoryType==cudaMemoryTypeHost) fprintf(stderr,"Memory is located on host\n"); + if (ptr_att.memoryType==cudaMemoryTypeDevice) fprintf(stderr,"Memory is located on device\n"); + fprintf(stderr,"Device associated with this pointer is %d\n",ptr_att.device); + return HYPRE_MANAGED_POINTER; + } else { + fprintf(stderr,"PrintPointerAttributes:: Non-Managed & non-raw pointer\n Probably pinned host pointer\n"); + if (ptr_att.memoryType==cudaMemoryTypeHost) { + fprintf(stderr,"Memory is located on host\n"); + return HYPRE_PINNED_POINTER; + } + if (ptr_att.memoryType==cudaMemoryTypeDevice) { + fprintf(stderr,"Memory is located on device\n"); + return HYPRE_DEVICE_POINTER ; + } + return HYPRE_UNDEFINED_POINTER1; + } + return HYPRE_UNDEFINED_POINTER2; +} +hypre_int PointerAttributes(const void *ptr){ + struct cudaPointerAttributes ptr_att; + if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){ + cudaGetLastError(); + return HYPRE_HOST_POINTER; + } + if (ptr_att.isManaged){ + return HYPRE_MANAGED_POINTER; + } else { + if (ptr_att.memoryType==cudaMemoryTypeHost) return HYPRE_PINNED_POINTER; /* Host pointer from cudaMallocHost */ + if (ptr_att.memoryType==cudaMemoryTypeDevice) return HYPRE_DEVICE_POINTER ; /* cudadevice pointer */ + return HYPRE_UNDEFINED_POINTER1; /* Shouldn't happen */ + } + return HYPRE_UNDEFINED_POINTER2; /* Shouldnt happen */ +} + +#endif diff -Nru hypre-2.11.2/src/utilities/gpuErrorCheck.h hypre-2.13.0/src/utilities/gpuErrorCheck.h --- hypre-2.11.2/src/utilities/gpuErrorCheck.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/gpuErrorCheck.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,153 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#ifdef HYPRE_USE_MANAGED +#include +#define CUDAMEMATTACHTYPE cudaMemAttachGlobal +#define MEM_PAD_LEN 1 +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file); + fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code)); + exit(2); + } +} +#define HYPRE_HOST_POINTER 0 +#define HYPRE_MANAGED_POINTER 1 +#define HYPRE_PINNED_POINTER 2 +#define HYPRE_DEVICE_POINTER 3 +#define HYPRE_UNDEFINED_POINTER1 4 +#define HYPRE_UNDEFINED_POINTER2 5 +void cudaSafeFree(void *ptr,int padding); +hypre_int PrintPointerAttributes(const void *ptr); +hypre_int PointerAttributes(const void *ptr); +#endif + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +#ifndef __cusparseErrorCheck__ +#define __cusparseErrorCheck__ +#include +#include +#include +//#include +#include +inline const char *cusparseErrorCheck(cusparseStatus_t error) +{ + switch (error) + { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "Unknown error in cusparseErrorCheck"; + } + +} +inline const char *cublasErrorCheck(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + default: + return "Unknown error in cublasErrorCheck"; + } + +} +//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +//inline void gpuAssert(cudaError_t code, const char *file, int line) +//{ +// if (code != cudaSuccess) +// { +// fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file); +// fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code)); +// exit(2); +// } +//} +#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); } +inline void cusparseAssert(cusparseStatus_t code, const char *file, int line) +{ + if (code != CUSPARSE_STATUS_SUCCESS) + { + fprintf(stderr,"CUSPARSE ERROR ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file); + fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code)); + } +} +#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); } +inline void cublasAssert(cublasStatus_t code, const char *file, int line) +{ + if (code != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr,"CUBLAS ERROR ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file); + fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code)); + } +} +//int PointerType(const void *ptr); +void cudaSafeFree(void *ptr,int padding); +//void PrintPointerAttributes(const void *ptr); +//size_t mempush(void* ptr, size_t size,int purge); +//int memloc(void *ptr, int device); +#endif +#endif + diff -Nru hypre-2.11.2/src/utilities/gpuMem.c hypre-2.13.0/src/utilities/gpuMem.c --- hypre-2.11.2/src/utilities/gpuMem.c 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/gpuMem.c 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,513 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "_hypre_utilities.h" +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +#include +#include + +#include +#include +hypre_int ggc(hypre_int id); + +/* Global struct that holds device,library handles etc */ +struct hypre__global_struct hypre__global_handle = { .initd=0, .device=0, .device_count=1,.memoryHWM=0}; + + +/* Initialize GPU branch of Hypre AMG */ +/* use_device =-1 */ +/* Application passes device number it is using or -1 to let Hypre decide on which device to use */ +void hypre_GPUInit(hypre_int use_device){ + char pciBusId[80]; + hypre_int myid; + hypre_int nDevices; + hypre_int device; + if (!HYPRE_GPU_HANDLE){ + HYPRE_GPU_HANDLE=1; + HYPRE_DEVICE=0; + gpuErrchk(cudaGetDeviceCount(&nDevices)); + HYPRE_DEVICE_COUNT=nDevices; + + if (use_device<0){ + if (nDevices==1){ + /* with mpibind each process will only see 1 GPU */ + HYPRE_DEVICE=0; + gpuErrchk(cudaSetDevice(HYPRE_DEVICE)); + cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE); + } else if (nDevices>1) { + /* No mpibind or it is a single rank run */ + hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + //affs(myid); + MPI_Comm node_comm; + MPI_Info info; + MPI_Info_create(&info); + MPI_Comm_split_type(hypre_MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, myid, info, &node_comm); + hypre_int round_robin=1; + hypre_int myNodeid, NodeSize; + MPI_Comm_rank(node_comm, &myNodeid); + MPI_Comm_size(node_comm, &NodeSize); + if (round_robin){ + /* Round robin allocation of GPUs. Does not account for affinities */ + HYPRE_DEVICE=myNodeid%nDevices; + gpuErrchk(cudaSetDevice(HYPRE_DEVICE)); + cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE); + hypre_printf("WARNING:: Code running without mpibind\n"); + hypre_printf("Global ID = %d , Node ID %d running on device %d of %d \n",myid,myNodeid,HYPRE_DEVICE,nDevices); + } else { + /* Try to set the GPU based on process binding */ + /* works correcly for all cases */ + MPI_Comm numa_comm; + MPI_Comm_split(node_comm,getnuma(),myNodeid,&numa_comm); + hypre_int myNumaId,NumaSize; + MPI_Comm_rank(numa_comm, &myNumaId); + MPI_Comm_size(numa_comm, &NumaSize); + hypre_int domain_devices=nDevices/2; /* Again hardwired for 2 NUMA domains */ + HYPRE_DEVICE = getnuma()*2+myNumaId%domain_devices; + gpuErrchk(cudaSetDevice(HYPRE_DEVICE)); + hypre_printf("WARNING:: Code running without mpibind\n"); + hypre_printf("NUMA %d GID %d , NodeID %d NumaID %d running on device %d (RR=%d) of %d \n",getnuma(),myid,myNodeid,myNumaId,HYPRE_DEVICE,myNodeid%nDevices,nDevices); + + } + + MPI_Info_free(&info); + } else { + /* No device found */ + hypre_fprintf(stderr,"ERROR:: NO GPUS found \n"); + exit(2); + } + } else { + HYPRE_DEVICE = use_device; + gpuErrchk(cudaSetDevice(HYPRE_DEVICE)); + } + + /* Create NVTX domain for all the nvtx calls in HYPRE */ + HYPRE_DOMAIN=nvtxDomainCreateA("Hypre"); + + /* Initialize streams */ + hypre_int jj; + for(jj=0;jj0){ + PrintPointerAttributes(ptr); + gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream)); + gpuErrchk(cudaStreamSynchronize(stream)); + POP_RANGE; + return; + } + return; +} + + +void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream){ + if (ptr==NULL) return; + size_t size=memsize(ptr); + PUSH_RANGE_PAYLOAD("MemPreFetchForce",4,size); + gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream)); + POP_RANGE; + return; +} + +void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream){ + if (ptr==NULL) return; + PUSH_RANGE_DOMAIN("MemPreFetchSized",4,0); + /* Do a prefetch every time until a possible UM bug is fixed */ + if (size>0){ + gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream)); + POP_RANGE_DOMAIN(0); + return; + } + return; +} + + +/* Returns the same cublas handle with every call */ +cublasHandle_t getCublasHandle(){ + cublasStatus_t stat; + static cublasHandle_t handle; + static hypre_int firstcall=1; + if (firstcall){ + firstcall=0; + stat = cublasCreate(&handle); + if (stat!=CUBLAS_STATUS_SUCCESS) { + printf("ERROR:: CUBLAS Library initialization failed\n"); + handle=0; + exit(2); + } + cublasErrchk(cublasSetStream(handle,HYPRE_STREAM(4))); + } else return handle; + return handle; +} + +/* Returns the same cusparse handle with every call */ +cusparseHandle_t getCusparseHandle(){ + cusparseStatus_t status; + static cusparseHandle_t handle; + static hypre_int firstcall=1; + if (firstcall){ + firstcall=0; + status= cusparseCreate(&handle); + if (status != CUSPARSE_STATUS_SUCCESS) { + printf("ERROR:: CUSPARSE Library initialization failed\n"); + handle=0; + exit(2); + } + cusparseErrchk(cusparseSetStream(handle,HYPRE_STREAM(4))); + } else return handle; + return handle; +} + +/* C version of mempush using linked lists */ + +size_t mempush(const void *ptr, size_t size, hypre_int action){ + static node* head=NULL; + static hypre_int nc=0; + node *found=NULL; + if (!head){ + if ((size<=0)||(action==1)) { + fprintf(stderr,"mempush can start only with an insertion or a size call \n"); + return 0; + } + head = (node*)malloc(sizeof(node)); + head->ptr=ptr; + head->size=size; + head->next=NULL; + nc++; + return size; + } else { + // Purge an address + if (action==1){ + found=memfind(head,ptr); + if (found){ + memdel(&head, found); + nc--; + return 0; + } else { +#ifdef FULL_WARN + fprintf(stderr,"ERROR :: Pointer for deletion not found in linked list %p\n",ptr); +#endif + return 0; + } + } // End purge + + // Insertion + if (size>0){ + found=memfind(head,ptr); + if (found){ +#ifdef FULL_WARN + fprintf(stderr,"ERROR :: Pointer for insertion already in use in linked list %p\n",ptr); + //printlist(head,nc); +#endif + return 0; + } else { + nc++; + meminsert(&head,ptr,size); + return 0; + } + } + + // Getting allocation size + found=memfind(head,ptr); + if (found){ + return found->size; + } else{ +#ifdef FULL_WARN + fprintf(stderr,"ERROR :: Pointer for size check NOT found in linked list\n"); +#endif + return 0; + } + } +} + +node *memfind(node *head, const void *ptr){ + node *next; + next=head; + while(next!=NULL){ + if (next->ptr==ptr) return next; + next=next->next; + } + return NULL; +} + +void memdel(node **head, node *found){ + node *next; + if (found==*head){ + next=(*head)->next; + free(*head); + *head=next; + return; + } + next=*head; + while(next->next!=found){ + next=next->next; + } + next->next=next->next->next; + free(found); + return; +} +void meminsert(node **head, const void *ptr,size_t size){ + node *nhead; + nhead = (node*)malloc(sizeof(node)); + nhead->ptr=ptr; + nhead->size=size; + nhead->next=*head; + *head=nhead; + return; +} + +void printlist(node *head,hypre_int nc){ + node *next; + next=head; + printf("Node count %d \n",nc); + while(next!=NULL){ + printf("Address %p of size %zu \n",next->ptr,next->size); + next=next->next; + } +} + +cudaStream_t getstreamOlde(hypre_int i){ + static hypre_int firstcall=1; + const hypre_int MAXSTREAMS=10; + static cudaStream_t s[MAXSTREAMS]; + if (firstcall){ + hypre_int jj; + for(jj=0;jjsum1) return 0; + else return 1; + } else { + fprintf(stderr,"sched_affinity failed\n"); + switch(errno){ + case EFAULT: + printf("INVALID MEMORY ADDRESS\n"); + break; + case EINVAL: + printf("EINVAL:: NO VALID CPUS\n"); + break; + default: + printf("%d something else\n",errno); + } + } + return 0; + CPU_FREE(mask); + +} +hypre_int checkDeviceProps(){ + struct cudaDeviceProp prop; + gpuErrchk(cudaGetDeviceProperties(&prop, HYPRE_DEVICE)); + HYPRE_GPU_CMA=prop.concurrentManagedAccess; + return HYPRE_GPU_CMA; +} +hypre_int pointerIsManaged(const void *ptr){ + struct cudaPointerAttributes ptr_att; + if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess) { + return 0; + } + return ptr_att.isManaged; +} +#endif diff -Nru hypre-2.11.2/src/utilities/gpuMem.h hypre-2.13.0/src/utilities/gpuMem.h --- hypre-2.11.2/src/utilities/gpuMem.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/gpuMem.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,104 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +#ifndef __GPUMEM_H__ +#define __GPUMEM_H__ +#ifdef HYPRE_USE_GPU +#include +void hypre_GPUInit(hypre_int use_device); +void hypre_GPUFinalize(); +int VecScaleScalar(double *u, const double alpha, int num_rows,cudaStream_t s); +void VecCopy(double* tgt, const double* src, int size,cudaStream_t s); +void VecSet(double* tgt, int size, double value, cudaStream_t s); +void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void CudaCompileFlagCheck(); +#endif + +cudaStream_t getstreamOlde(hypre_int i); +nvtxDomainHandle_t getdomain(hypre_int i); +cudaEvent_t getevent(hypre_int i); +void MemAdviseReadOnly(const void *ptr, hypre_int device); +void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device); +void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device); +void MemAdviseSetPrefLocHost(const void *ptr); +void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream); +void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream); +void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream); +cublasHandle_t getCublasHandle(); +cusparseHandle_t getCusparseHandle(); +typedef struct node { + const void *ptr; + size_t size; + struct node *next; +} node; +size_t mempush(const void *ptr, size_t size, hypre_int action); +node *memfind(node *head, const void *ptr); +void memdel(node **head, node *found); +void meminsert(node **head, const void *ptr,size_t size); +void printlist(node *head,hypre_int nc); +//#define MEM_PAD_LEN 1 +size_t memsize(const void *ptr); +hypre_int getsetasyncmode(hypre_int mode, hypre_int action); +void SetAsyncMode(hypre_int mode); +hypre_int GetAsyncMode(); +void branchStream(hypre_int i, hypre_int j); +void joinStreams(hypre_int i, hypre_int j, hypre_int k); +void affs(hypre_int myid); +hypre_int getcore(); +hypre_int getnuma(); +hypre_int checkDeviceProps(); +hypre_int pointerIsManaged(const void *ptr); +/* + * Global struct for keeping HYPRE GPU Init state + */ + +#define MAX_HGS_ELEMENTS 10 +struct hypre__global_struct{ + hypre_int initd; + hypre_int device; + hypre_int device_count; + cublasHandle_t cublas_handle; + cusparseHandle_t cusparse_handle; + cusparseMatDescr_t cusparse_mat_descr; + cudaStream_t streams[MAX_HGS_ELEMENTS]; + nvtxDomainHandle_t nvtx_domain; + hypre_int concurrent_managed_access; + size_t memoryHWM; +}; + +extern struct hypre__global_struct hypre__global_handle ; + +/* + * Macros for accessing elements of the global handle + */ +#define HYPRE_GPU_HANDLE hypre__global_handle.initd +#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle +#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle +#define HYPRE_DEVICE hypre__global_handle.device +#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count +#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr +#define HYPRE_STREAM(index) (hypre__global_handle.streams[index]) +#define HYPRE_DOMAIN hypre__global_handle.nvtx_domain +#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access +#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM + +#endif + +#else + +#define hypre_GPUInit(use_device) +#define hypre_GPUFinalize() + +#endif + diff -Nru hypre-2.11.2/src/utilities/headers hypre-2.13.0/src/utilities/headers --- hypre-2.11.2/src/utilities/headers 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/headers 2017-10-20 17:42:22.000000000 +0000 @@ -19,11 +19,18 @@ cat > $INTERNAL_HEADER <<@ +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + + #ifndef hypre_UTILITIES_HEADER #define hypre_UTILITIES_HEADER #include "HYPRE_utilities.h" +#ifdef HYPRE_USING_OPENMP +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -38,19 +45,19 @@ cat mpistubs.h >> $INTERNAL_HEADER cat hypre_smp.h >> $INTERNAL_HEADER cat hypre_memory.h >> $INTERNAL_HEADER -cat thread_mpistubs.h >> $INTERNAL_HEADER cat threading.h >> $INTERNAL_HEADER cat timing.h >> $INTERNAL_HEADER cat amg_linklist.h >> $INTERNAL_HEADER cat exchange_data.h >> $INTERNAL_HEADER cat hypre_error.h >> $INTERNAL_HEADER -./protos amg_linklist.c >> $INTERNAL_HEADER -./protos binsearch.c >> $INTERNAL_HEADER -./protos hypre_printf.c >> $INTERNAL_HEADER -./protos hypre_qsort.c >> $INTERNAL_HEADER -./protos qsplit.c >> $INTERNAL_HEADER -./protos random.c >> $INTERNAL_HEADER - +cat caliper_instrumentation.h >> $INTERNAL_HEADER +cat gpgpu.h >> $INTERNAL_HEADER +cat hypre_nvtx.h >> $INTERNAL_HEADER +cat gpuErrorCheck.h >> $INTERNAL_HEADER +cat gpuMem.h >> $INTERNAL_HEADER + +cat protos.h >> $INTERNAL_HEADER + #=========================================================================== # Include guards #=========================================================================== diff -Nru hypre-2.11.2/src/utilities/hypre_error.h hypre-2.13.0/src/utilities/hypre_error.h --- hypre-2.11.2/src/utilities/hypre_error.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/hypre_error.h 2017-10-20 17:42:22.000000000 +0000 @@ -10,8 +10,6 @@ * $Revision$ ***********************************************************************EHEADER*/ - - #ifndef hypre_ERROR_HEADER #define hypre_ERROR_HEADER @@ -37,3 +35,4 @@ #endif #endif + diff -Nru hypre-2.11.2/src/utilities/hypre_memory.c hypre-2.13.0/src/utilities/hypre_memory.c --- hypre-2.11.2/src/utilities/hypre_memory.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/hypre_memory.c 2017-10-20 17:42:22.000000000 +0000 @@ -16,12 +16,14 @@ * *****************************************************************************/ +#define HYPRE_USE_MANAGED_SCALABLE 1 #include "_hypre_utilities.h" - +//#include "gpgpu.h" +//#include "hypre_nvtx.h" +//#include "gpuMem.h" #ifdef HYPRE_USE_UMALLOC #undef HYPRE_USE_UMALLOC #endif - /****************************************************************************** * * Standard routines @@ -54,10 +56,23 @@ if (size > 0) { + PUSH_RANGE_PAYLOAD("MALLOC",2,size); #ifdef HYPRE_USE_UMALLOC HYPRE_Int threadid = hypre_GetThreadID(); - +#ifdef HYPRE_USE_MANAGED + printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n"); +#endif ptr = _umalloc_(size); +#elif HYPRE_USE_MANAGED +#ifdef HYPRE_USE_MANAGED_SCALABLE + gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) ); + size_t *sp=(size_t*)ptr; + *sp=size; + ptr=(void*)(&sp[MEM_PAD_LEN]); +#else + gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) ); + mempush(ptr,size,0); +#endif #else ptr = malloc(size); #endif @@ -68,6 +83,7 @@ hypre_OutOfMemory(size); } #endif + POP_RANGE; } else { @@ -90,10 +106,23 @@ if (size > 0) { + PUSH_RANGE_PAYLOAD("MALLOC",4,size); #ifdef HYPRE_USE_UMALLOC +#ifdef HYPRE_USE_MANAGED + printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n"); +#endif HYPRE_Int threadid = hypre_GetThreadID(); ptr = _ucalloc_(count, elt_size); +#elif HYPRE_USE_MANAGED +#ifdef HYPRE_USE_MANAGED_SCALABLE + ptr=(void*)hypre_MAlloc(size); + memset(ptr,0,count*elt_size); +#else + gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) ); + memset(ptr,0,count*elt_size); + mempush(ptr,size,0); +#endif #else ptr = calloc(count, elt_size); #endif @@ -104,6 +133,7 @@ hypre_OutOfMemory(size); } #endif + POP_RANGE; } else { @@ -113,6 +143,11 @@ return(char*) ptr; } +#ifdef HYPRE_USE_MANAGED +size_t memsize(const void *ptr){ +return ((size_t*)ptr)[-MEM_PAD_LEN]; +} +#endif /*-------------------------------------------------------------------------- * hypre_ReAlloc *--------------------------------------------------------------------------*/ @@ -135,6 +170,32 @@ HYPRE_Int threadid = hypre_GetThreadID(); ptr = (char*)_urealloc_(ptr, size); } +#elif HYPRE_USE_MANAGED + if (ptr == NULL) + { + + ptr = hypre_MAlloc(size); + } + else if (size == 0) + { + hypre_Free(ptr); + return NULL; + } + else + { + void *nptr = hypre_MAlloc(size); +#ifdef HYPRE_USE_MANAGED_SCALABLE + size_t old_size=memsize((void*)ptr); +#else + size_t old_size=mempush((void*)ptr,0,0); +#endif + if (size>old_size) + memcpy(nptr,ptr,old_size); + else + memcpy(nptr,ptr,size); + hypre_Free(ptr); + ptr=(char*) nptr; + } #else if (ptr == NULL) { @@ -156,6 +217,7 @@ return ptr; } + /*-------------------------------------------------------------------------- * hypre_Free *--------------------------------------------------------------------------*/ @@ -169,6 +231,182 @@ HYPRE_Int threadid = hypre_GetThreadID(); _ufree_(ptr); +#elif HYPRE_USE_MANAGED + //size_t size=mempush(ptr,0,0); +#ifdef HYPRE_USE_MANAGED_SCALABLE + cudaSafeFree(ptr,MEM_PAD_LEN); +#else + mempush(ptr,0,1); + cudaSafeFree(ptr,0); +#endif + //gpuErrchk(cudaFree((void*)ptr)); +#else + free(ptr); +#endif + } +} +/*-------------------------------------------------------------------------- + * hypre_MAllocPinned + *--------------------------------------------------------------------------*/ + +char * +hypre_MAllocPinned( size_t size ) +{ + void *ptr; + + if (size > 0) + { + PUSH_RANGE_PAYLOAD("MALLOC",2,size); +#ifdef HYPRE_USE_UMALLOC + HYPRE_Int threadid = hypre_GetThreadID(); +#ifdef HYPRE_USE_MANAGED + printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n"); +#endif + ptr = _umalloc_(size); +#elif HYPRE_USE_MANAGED +#ifdef HYPRE_USE_MANAGED_SCALABLE +#ifdef HYPRE_GPU_USE_PINNED + gpuErrchk( cudaHostAlloc(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,cudaHostAllocMapped)); +#else + gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) ); +#endif + size_t *sp=(size_t*)ptr; + *sp=size; + ptr=(void*)(&sp[MEM_PAD_LEN]); +#else + gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) ); + mempush(ptr,size,0); +#endif +#else + ptr = malloc(size); +#endif + +#if 1 + if (ptr == NULL) + { + hypre_OutOfMemory(size); + } +#endif + POP_RANGE; + } + else + { + ptr = NULL; + } + + return (char*)ptr; +} +/*-------------------------------------------------------------------------- + * hypre_MAllocHost + *--------------------------------------------------------------------------*/ + +char * +hypre_MAllocHost( size_t size ) +{ + void *ptr; + + if (size > 0) + { + ptr = malloc(size); +#if 1 + if (ptr == NULL) + { + hypre_OutOfMemory(size); + } +#endif + POP_RANGE; + } + else + { + ptr = NULL; + } + + return (char*)ptr; +} + +/*-------------------------------------------------------------------------- + * hypre_CAllocHost + *--------------------------------------------------------------------------*/ + +char * +hypre_CAllocHost( size_t count, + size_t elt_size ) +{ + void *ptr; + size_t size = count*elt_size; + + if (size > 0) + { + PUSH_RANGE_PAYLOAD("CAllocHost",4,size); +#ifdef HYPRE_USE_UMALLOC +#ifdef HYPRE_USE_MANAGED + printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n"); +#endif + HYPRE_Int threadid = hypre_GetThreadID(); + +ptr = _ucalloc_(count, elt_size); + +#else + ptr = calloc(count, elt_size); +#endif + +#if 1 + if (ptr == NULL) + { + hypre_OutOfMemory(size); + } +#endif + POP_RANGE; + } + else + { + ptr = NULL; + } + + return(char*) ptr; +} +/*-------------------------------------------------------------------------- + * hypre_ReAllocHost + *--------------------------------------------------------------------------*/ + +char * +hypre_ReAllocHost( char *ptr, + size_t size ) +{ + if (ptr == NULL) + { + ptr = (char*)malloc(size); + } + else + { + + ptr = (char*)realloc(ptr, size); + } + +#if 1 + if ((ptr == NULL) && (size > 0)) + { + hypre_OutOfMemory(size); + } +#endif + + return ptr; +} + +/*-------------------------------------------------------------------------- + * hypre_CHFree + *--------------------------------------------------------------------------*/ + +void +hypre_FreeHost( char *ptr ) +{ + if (ptr) + { +#ifdef HYPRE_USE_UMALLOC + HYPRE_Int threadid = hypre_GetThreadID(); + + _ufree_(ptr); + #else free(ptr); #endif diff -Nru hypre-2.11.2/src/utilities/hypre_memory.h hypre-2.13.0/src/utilities/hypre_memory.h --- hypre-2.11.2/src/utilities/hypre_memory.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/hypre_memory.h 2017-10-20 17:42:22.000000000 +0000 @@ -26,6 +26,157 @@ extern "C" { #endif +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED) +#ifdef __cplusplus +extern "C++" { +#endif +#include +#include +#ifdef __cplusplus +} +#endif +#define HYPRE_CUDA_GLOBAL __host__ __device__ + +#if defined(HYPRE_MEMORY_GPU) +#define hypre_DeviceTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1; \ + } \ + ptr;}) + +#define hypre_DeviceCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + } \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + +#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr; \ + cudaMalloc((void**)&,sizeof(type)*(count), cudaMemAttachGlobal); \ + memcpy(newptr, ptr, sizeof(type)*(count)); \ + cudaFree(ptr); \ + ptr = newptr;} +#else + #define hypre_DeviceTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\ + if ( cudaerr != cudaSuccess ) { \ + printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + }\ + ptr;}) + +#define hypre_DeviceCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + } \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + +#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + memcpy(newptr, ptr, sizeof(type)*(count)); \ + cudaFree(ptr); \ + ptr = newptr;} +#endif + +#define hypre_DeviceTFree(ptr) \ + { \ + cudaError_t cudaerr = cudaFree(ptr); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1; \ + } \ + } \ + + +#define hypre_DataCopyToData(ptrH,ptrD,type,count) \ + {cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \ +if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ +} \ + } + + +#define hypre_DataCopyFromData(ptrH,ptrD,type,count) \ + {cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + }\ + } + +#define hypre_DeviceMemset(ptr,value,type,count) \ + cudaMemset(ptr,value,count*sizeof(type)); + +#define hypre_UMTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + ptr; \ + }) + +#define hypre_UMCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + + +#define hypre_UMTReAlloc(type, count)\ + ({ \ + type * ptr; \ + type *newptr; \ + cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + cudaFree(ptr); \ + ptr = newptr; \ + ptr;}) \ + +#define hypre_UMTFree(ptr) \ + cudaFree(ptr) + +#define hypre_InitMemoryDebug(id) +#define hypre_FinalizeMemoryDebug() + +#define hypre_TAlloc(type, count) \ +( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) ) + +#define hypre_CTAlloc(type, count) \ +( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) ) + +#define hypre_TReAlloc(ptr, type, count) \ +( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) ) + +#define hypre_TFree(ptr) \ +( hypre_Free((char *)ptr), ptr = NULL ) + + //#define hypre_TAlloc(type, count) hypre_UMTAlloc(type, count) + //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count) + //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count) + //#define hypre_TFree(ptr) hypre_UMTFree(ptr) + +#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_SharedTFree(ptr) hypre_TFree(ptr) +#else +#define HYPRE_CUDA_GLOBAL + /*-------------------------------------------------------------------------- * Use "Debug Malloc Library", dmalloc *--------------------------------------------------------------------------*/ @@ -79,6 +230,34 @@ #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count)) #define hypre_SharedTFree(ptr) hypre_TFree(ptr) +#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_DeviceTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_DeviceTFree(ptr) hypre_TFree(ptr) +#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count)) +#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count)) +#define hypre_DeviceMemset(ptr,value,type,count) memset(ptr,value,count*sizeof(type)) +#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_UMTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_UMTFree(ptr) hypre_TFree(ptr) +#endif + +#define hypre_PinnedTAlloc(type, count)\ +( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) ) + +#define hypre_HostTAlloc(type, count) \ +( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) ) + +#define hypre_HostCTAlloc(type, count) \ +( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) ) + +#define hypre_HostTReAlloc(ptr, type, count) \ +( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) ) + +#define hypre_HostTFree(ptr) \ +( hypre_FreeHost((char *)ptr), ptr = NULL ) + /*-------------------------------------------------------------------------- * Prototypes *--------------------------------------------------------------------------*/ @@ -87,8 +266,13 @@ HYPRE_Int hypre_OutOfMemory ( size_t size ); char *hypre_MAlloc ( size_t size ); char *hypre_CAlloc ( size_t count , size_t elt_size ); +char *hypre_MAllocPinned( size_t size ); char *hypre_ReAlloc ( char *ptr , size_t size ); void hypre_Free ( char *ptr ); +char *hypre_CAllocHost( size_t count,size_t elt_size ); +char *hypre_MAllocHost( size_t size ); +char *hypre_ReAllocHost( char *ptr,size_t size ); +void hypre_FreeHost( char *ptr ); char *hypre_SharedMAlloc ( size_t size ); char *hypre_SharedCAlloc ( size_t count , size_t elt_size ); char *hypre_SharedReAlloc ( char *ptr , size_t size ); @@ -108,3 +292,4 @@ #endif #endif + diff -Nru hypre-2.11.2/src/utilities/hypre_nvtx.h hypre-2.13.0/src/utilities/hypre_nvtx.h --- hypre-2.11.2/src/utilities/hypre_nvtx.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/hypre_nvtx.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,72 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#ifdef USE_NVTX +#include "nvToolsExt.h" +#include "nvToolsExtCudaRt.h" + +static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; +static const int num_colors = sizeof(colors)/sizeof(uint32_t); + +#define PUSH_RANGE(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \ +} + +#define PUSH_RANGE_PAYLOAD(name,cid,load) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \ + eventAttrib.payload.llValue = load; \ + eventAttrib.category=1; \ + nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \ +} + +#define PUSH_RANGE_DOMAIN(name,cid,dId) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxDomainRangePushEx(getdomain(dId),&eventAttrib); \ +} + +#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN); +#define POP_RANGE_DOMAIN(dId) { \ + nvtxDomainRangePop(getdomain(dId)); \ + } +#else +#define PUSH_RANGE(name,cid) +#define POP_RANGE +#define PUSH_RANGE_PAYLOAD(name,cid,load) +#define PUSH_RANGE_DOMAIN(name,cid,domainName) +#endif + diff -Nru hypre-2.11.2/src/utilities/hypre_printf.c hypre-2.13.0/src/utilities/hypre_printf.c --- hypre-2.11.2/src/utilities/hypre_printf.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/hypre_printf.c 2017-10-20 17:42:22.000000000 +0000 @@ -14,7 +14,7 @@ #include #include -#ifdef HYPRE_BIGINT +// #ifdef HYPRE_BIGINT /* these prototypes are missing by default for some compilers */ int vscanf( const char *format , va_list arg ); @@ -42,32 +42,50 @@ } else if (foundpercent) { + if (*fp == 'l') + { + fp++; /* remove 'l' and maybe add it back in switch statement */ + if (*fp == 'l') + { + fp++; /* remove second 'l' if present */ + } + } switch(*fp) { case 'd': + case 'i': +#if defined(HYPRE_BIGINT) *nfp = 'l'; nfp++; *nfp = 'l'; nfp++; - case 'c': +#endif + foundpercent = 0; break; + case 'f': case 'e': case 'E': - case 'f': case 'g': case 'G': - case 'i': +#if defined(HYPRE_SINGLE) /* no modifier */ +#elif defined(HYPRE_LONG_DOUBLE) /* modify with 'L' */ + *nfp = 'L'; nfp++; +#else /* modify with 'l' (default is _double_) */ + *nfp = 'l'; nfp++; +#endif + foundpercent = 0; break; + case 'c': case 'n': case 'o': case 'p': case 's': case 'u': case 'x': - case 'S': + case 'X': case '%': - foundpercent = 0; + foundpercent = 0; break; } } *nfp = *fp; nfp++; } - *nfp = *fp; nfp++; + *nfp = *fp; *newformat_ptr = newformat; @@ -79,9 +97,7 @@ HYPRE_Int free_format( char *newformat ) { -#ifdef HYPRE_BIGINT hypre_TFree(newformat); -#endif return 0; } @@ -186,9 +202,9 @@ return ierr; } -#else - -/* this is used only to eliminate compiler warnings */ -HYPRE_Int hypre_printf_empty; - -#endif +// #else +// +// /* this is used only to eliminate compiler warnings */ +// HYPRE_Int hypre_printf_empty; +// +// #endif diff -Nru hypre-2.11.2/src/utilities/_hypre_utilities.h hypre-2.13.0/src/utilities/_hypre_utilities.h --- hypre-2.11.2/src/utilities/_hypre_utilities.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/_hypre_utilities.h 2017-10-20 17:42:22.000000000 +0000 @@ -1,14 +1,6 @@ -/*BHEADER********************************************************************** - * Copyright (c) 2008, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * This file is part of HYPRE. See file COPYRIGHT for details. - * - * HYPRE is free software; you can redistribute it and/or modify it under the - * terms of the GNU Lesser General Public License (as published by the Free - * Software Foundation) version 2.1 dated February 1999. - * - * $Revision$ - ***********************************************************************EHEADER*/ + +/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/ + #ifndef hypre_UTILITIES_HEADER #define hypre_UTILITIES_HEADER @@ -19,15 +11,6 @@ #include #endif -/* This allows us to consistently avoid 'int' throughout hypre */ -typedef int hypre_int; -typedef long int hypre_longint; -typedef unsigned int hypre_uint; -typedef unsigned long int hypre_ulongint; - -/* This allows us to consistently avoid 'double' throughout hypre */ -typedef double hypre_double; - #ifdef __cplusplus extern "C" { #endif @@ -53,6 +36,15 @@ #ifndef hypre_GENERAL_HEADER #define hypre_GENERAL_HEADER +/* This allows us to consistently avoid 'int' throughout hypre */ +typedef int hypre_int; +typedef long int hypre_longint; +typedef unsigned int hypre_uint; +typedef unsigned long int hypre_ulongint; + +/* This allows us to consistently avoid 'double' throughout hypre */ +typedef double hypre_double; + /*-------------------------------------------------------------------------- * Define various functions *--------------------------------------------------------------------------*/ @@ -130,7 +122,9 @@ #define MPI_BOTTOM hypre_MPI_BOTTOM +#define MPI_FLOAT hypre_MPI_FLOAT #define MPI_DOUBLE hypre_MPI_DOUBLE +#define MPI_LONG_DOUBLE hypre_MPI_LONG_DOUBLE #define MPI_INT hypre_MPI_INT #define MPI_LONG_LONG_INT hypre_MPI_INT #define MPI_CHAR hypre_MPI_CHAR @@ -142,6 +136,7 @@ #define MPI_MIN hypre_MPI_MIN #define MPI_MAX hypre_MPI_MAX #define MPI_LOR hypre_MPI_LOR +#define MPI_LAND hypre_MPI_LAND #define MPI_SUCCESS hypre_MPI_SUCCESS #define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE @@ -232,18 +227,21 @@ #define hypre_MPI_BOTTOM 0x0 -#define hypre_MPI_DOUBLE 0 -#define hypre_MPI_INT 1 -#define hypre_MPI_CHAR 2 -#define hypre_MPI_LONG 3 -#define hypre_MPI_BYTE 4 -#define hypre_MPI_REAL 5 -#define hypre_MPI_COMPLEX 6 +#define hypre_MPI_FLOAT 0 +#define hypre_MPI_DOUBLE 1 +#define hypre_MPI_LONG_DOUBLE 2 +#define hypre_MPI_INT 3 +#define hypre_MPI_CHAR 4 +#define hypre_MPI_LONG 5 +#define hypre_MPI_BYTE 6 +#define hypre_MPI_REAL 7 +#define hypre_MPI_COMPLEX 8 #define hypre_MPI_SUM 0 #define hypre_MPI_MIN 1 #define hypre_MPI_MAX 2 #define hypre_MPI_LOR 3 +#define hypre_MPI_LAND 4 #define hypre_MPI_SUCCESS 0 #define hypre_MPI_STATUSES_IGNORE 0 @@ -272,7 +270,9 @@ #define hypre_MPI_BOTTOM MPI_BOTTOM #define hypre_MPI_COMM_SELF MPI_COMM_SELF +#define hypre_MPI_FLOAT MPI_FLOAT #define hypre_MPI_DOUBLE MPI_DOUBLE +#define hypre_MPI_LONG_DOUBLE MPI_LONG_DOUBLE /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */ #define hypre_MPI_INT HYPRE_MPI_INT #define hypre_MPI_CHAR MPI_CHAR @@ -288,6 +288,7 @@ #define hypre_MPI_MAX MPI_MAX #define hypre_MPI_LOR MPI_LOR #define hypre_MPI_SUCCESS MPI_SUCCESS +#define hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE #define hypre_MPI_UNDEFINED MPI_UNDEFINED #define hypre_MPI_REQUEST_NULL MPI_REQUEST_NULL @@ -295,7 +296,6 @@ #define hypre_MPI_ANY_TAG MPI_ANY_TAG #define hypre_MPI_SOURCE MPI_SOURCE #define hypre_MPI_TAG MPI_TAG -#define hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE #define hypre_MPI_LAND MPI_LAND #endif @@ -415,6 +415,157 @@ extern "C" { #endif +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED) +#ifdef __cplusplus +extern "C++" { +#endif +#include +#include +#ifdef __cplusplus +} +#endif +#define HYPRE_CUDA_GLOBAL __host__ __device__ + +#if defined(HYPRE_MEMORY_GPU) +#define hypre_DeviceTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1; \ + } \ + ptr;}) + +#define hypre_DeviceCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + } \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + +#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr; \ + cudaMalloc((void**)&,sizeof(type)*(count), cudaMemAttachGlobal); \ + memcpy(newptr, ptr, sizeof(type)*(count)); \ + cudaFree(ptr); \ + ptr = newptr;} +#else + #define hypre_DeviceTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\ + if ( cudaerr != cudaSuccess ) { \ + printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + }\ + ptr;}) + +#define hypre_DeviceCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + } \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + +#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + memcpy(newptr, ptr, sizeof(type)*(count)); \ + cudaFree(ptr); \ + ptr = newptr;} +#endif + +#define hypre_DeviceTFree(ptr) \ + { \ + cudaError_t cudaerr = cudaFree(ptr); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1; \ + } \ + } \ + + +#define hypre_DataCopyToData(ptrH,ptrD,type,count) \ + {cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \ +if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ +} \ + } + + +#define hypre_DataCopyFromData(ptrH,ptrD,type,count) \ + {cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \ + if ( cudaerr != cudaSuccess ) { \ + printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \ + HYPRE_Int *p = NULL; *p = 1;\ + }\ + } + +#define hypre_DeviceMemset(ptr,value,type,count) \ + cudaMemset(ptr,value,count*sizeof(type)); + +#define hypre_UMTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + ptr; \ + }) + +#define hypre_UMCTAlloc(type, count) \ + ({ \ + type * ptr; \ + cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + cudaMemset(ptr,0,sizeof(type)*(count)); \ + ptr;}) \ + + +#define hypre_UMTReAlloc(type, count)\ + ({ \ + type * ptr; \ + type *newptr; \ + cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \ + cudaFree(ptr); \ + ptr = newptr; \ + ptr;}) \ + +#define hypre_UMTFree(ptr) \ + cudaFree(ptr) + +#define hypre_InitMemoryDebug(id) +#define hypre_FinalizeMemoryDebug() + +#define hypre_TAlloc(type, count) \ +( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) ) + +#define hypre_CTAlloc(type, count) \ +( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) ) + +#define hypre_TReAlloc(ptr, type, count) \ +( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) ) + +#define hypre_TFree(ptr) \ +( hypre_Free((char *)ptr), ptr = NULL ) + + //#define hypre_TAlloc(type, count) hypre_UMTAlloc(type, count) + //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count) + //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count) + //#define hypre_TFree(ptr) hypre_UMTFree(ptr) + +#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_SharedTFree(ptr) hypre_TFree(ptr) +#else +#define HYPRE_CUDA_GLOBAL + /*-------------------------------------------------------------------------- * Use "Debug Malloc Library", dmalloc *--------------------------------------------------------------------------*/ @@ -468,6 +619,34 @@ #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count)) #define hypre_SharedTFree(ptr) hypre_TFree(ptr) +#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_DeviceTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_DeviceTFree(ptr) hypre_TFree(ptr) +#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count)) +#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count)) +#define hypre_DeviceMemset(ptr,value,type,count) memset(ptr,value,count*sizeof(type)) +#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count)) +#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count)) +#define hypre_UMTReAlloc(type, count) hypre_TReAlloc(type, (count)) +#define hypre_UMTFree(ptr) hypre_TFree(ptr) +#endif + +#define hypre_PinnedTAlloc(type, count)\ +( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) ) + +#define hypre_HostTAlloc(type, count) \ +( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) ) + +#define hypre_HostCTAlloc(type, count) \ +( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) ) + +#define hypre_HostTReAlloc(ptr, type, count) \ +( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) ) + +#define hypre_HostTFree(ptr) \ +( hypre_FreeHost((char *)ptr), ptr = NULL ) + /*-------------------------------------------------------------------------- * Prototypes *--------------------------------------------------------------------------*/ @@ -476,8 +655,13 @@ HYPRE_Int hypre_OutOfMemory ( size_t size ); char *hypre_MAlloc ( size_t size ); char *hypre_CAlloc ( size_t count , size_t elt_size ); +char *hypre_MAllocPinned( size_t size ); char *hypre_ReAlloc ( char *ptr , size_t size ); void hypre_Free ( char *ptr ); +char *hypre_CAllocHost( size_t count,size_t elt_size ); +char *hypre_MAllocHost( size_t size ); +char *hypre_ReAllocHost( char *ptr,size_t size ); +void hypre_FreeHost( char *ptr ); char *hypre_SharedMAlloc ( size_t size ); char *hypre_SharedCAlloc ( size_t count , size_t elt_size ); char *hypre_SharedReAlloc ( char *ptr , size_t size ); @@ -577,11 +761,12 @@ #ifndef HYPRE_TIMING #define hypre_InitializeTiming(name) 0 +#define hypre_FinalizeTiming(index) #define hypre_IncFLOPCount(inc) #define hypre_BeginTiming(i) #define hypre_EndTiming(i) #define hypre_PrintTiming(heading, comm) -#define hypre_FinalizeTiming(index) +#define hypre_ClearTiming() /*-------------------------------------------------------------------------- * With timing on @@ -747,11 +932,9 @@ } hypre_DataExchangeResponse; - HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*); HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*); - HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, HYPRE_Int *contact_proc_list, void *contact_send_buf, HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, @@ -760,7 +943,6 @@ HYPRE_Int rnum, MPI_Comm comm, void **p_response_recv_buf, HYPRE_Int **p_response_recv_buf_starts); - #endif /* end of header */ /*BHEADER********************************************************************** @@ -840,9 +1022,365 @@ #endif /* CALIPER_INSTRUMENTATION_HEADER */ -/*-------------------------------------------------------------------------- - * Other prototypes - *--------------------------------------------------------------------------*/ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal +//#define CUDAMEMATTACHTYPE cudaMemAttachHost +#define HYPRE_GPU_USE_PINNED 1 +#define HYPRE_USE_MANAGED_SCALABLE 1 +#endif + +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#ifdef USE_NVTX +#include "nvToolsExt.h" +#include "nvToolsExtCudaRt.h" + +static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; +static const int num_colors = sizeof(colors)/sizeof(uint32_t); + +#define PUSH_RANGE(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \ +} + +#define PUSH_RANGE_PAYLOAD(name,cid,load) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \ + eventAttrib.payload.llValue = load; \ + eventAttrib.category=1; \ + nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \ +} + +#define PUSH_RANGE_DOMAIN(name,cid,dId) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxDomainRangePushEx(getdomain(dId),&eventAttrib); \ +} + +#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN); +#define POP_RANGE_DOMAIN(dId) { \ + nvtxDomainRangePop(getdomain(dId)); \ + } +#else +#define PUSH_RANGE(name,cid) +#define POP_RANGE +#define PUSH_RANGE_PAYLOAD(name,cid,load) +#define PUSH_RANGE_DOMAIN(name,cid,domainName) +#endif + +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#ifdef HYPRE_USE_MANAGED +#include +#define CUDAMEMATTACHTYPE cudaMemAttachGlobal +#define MEM_PAD_LEN 1 +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file); + fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code)); + exit(2); + } +} +#define HYPRE_HOST_POINTER 0 +#define HYPRE_MANAGED_POINTER 1 +#define HYPRE_PINNED_POINTER 2 +#define HYPRE_DEVICE_POINTER 3 +#define HYPRE_UNDEFINED_POINTER1 4 +#define HYPRE_UNDEFINED_POINTER2 5 +void cudaSafeFree(void *ptr,int padding); +hypre_int PrintPointerAttributes(const void *ptr); +hypre_int PointerAttributes(const void *ptr); +#endif + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +#ifndef __cusparseErrorCheck__ +#define __cusparseErrorCheck__ +#include +#include +#include +//#include +#include +inline const char *cusparseErrorCheck(cusparseStatus_t error) +{ + switch (error) + { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "Unknown error in cusparseErrorCheck"; + } + +} +inline const char *cublasErrorCheck(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + default: + return "Unknown error in cublasErrorCheck"; + } + +} +//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +//inline void gpuAssert(cudaError_t code, const char *file, int line) +//{ +// if (code != cudaSuccess) +// { +// fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file); +// fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code)); +// exit(2); +// } +//} +#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); } +inline void cusparseAssert(cusparseStatus_t code, const char *file, int line) +{ + if (code != CUSPARSE_STATUS_SUCCESS) + { + fprintf(stderr,"CUSPARSE ERROR ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file); + fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code)); + } +} +#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); } +inline void cublasAssert(cublasStatus_t code, const char *file, int line) +{ + if (code != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr,"CUBLAS ERROR ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file); + fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code)); + } +} +//int PointerType(const void *ptr); +void cudaSafeFree(void *ptr,int padding); +//void PrintPointerAttributes(const void *ptr); +//size_t mempush(void* ptr, size_t size,int purge); +//int memloc(void *ptr, int device); +#endif +#endif + +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED) +#ifndef __GPUMEM_H__ +#define __GPUMEM_H__ +#ifdef HYPRE_USE_GPU +#include +void hypre_GPUInit(hypre_int use_device); +void hypre_GPUFinalize(); +int VecScaleScalar(double *u, const double alpha, int num_rows,cudaStream_t s); +void VecCopy(double* tgt, const double* src, int size,cudaStream_t s); +void VecSet(double* tgt, int size, double value, cudaStream_t s); +void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s); +void CudaCompileFlagCheck(); +#endif + +cudaStream_t getstreamOlde(hypre_int i); +nvtxDomainHandle_t getdomain(hypre_int i); +cudaEvent_t getevent(hypre_int i); +void MemAdviseReadOnly(const void *ptr, hypre_int device); +void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device); +void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device); +void MemAdviseSetPrefLocHost(const void *ptr); +void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream); +void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream); +void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream); +cublasHandle_t getCublasHandle(); +cusparseHandle_t getCusparseHandle(); +typedef struct node { + const void *ptr; + size_t size; + struct node *next; +} node; +size_t mempush(const void *ptr, size_t size, hypre_int action); +node *memfind(node *head, const void *ptr); +void memdel(node **head, node *found); +void meminsert(node **head, const void *ptr,size_t size); +void printlist(node *head,hypre_int nc); +//#define MEM_PAD_LEN 1 +size_t memsize(const void *ptr); +hypre_int getsetasyncmode(hypre_int mode, hypre_int action); +void SetAsyncMode(hypre_int mode); +hypre_int GetAsyncMode(); +void branchStream(hypre_int i, hypre_int j); +void joinStreams(hypre_int i, hypre_int j, hypre_int k); +void affs(hypre_int myid); +hypre_int getcore(); +hypre_int getnuma(); +hypre_int checkDeviceProps(); +hypre_int pointerIsManaged(const void *ptr); +/* + * Global struct for keeping HYPRE GPU Init state + */ + +#define MAX_HGS_ELEMENTS 10 +struct hypre__global_struct{ + hypre_int initd; + hypre_int device; + hypre_int device_count; + cublasHandle_t cublas_handle; + cusparseHandle_t cusparse_handle; + cusparseMatDescr_t cusparse_mat_descr; + cudaStream_t streams[MAX_HGS_ELEMENTS]; + nvtxDomainHandle_t nvtx_domain; + hypre_int concurrent_managed_access; + size_t memoryHWM; +}; + +extern struct hypre__global_struct hypre__global_handle ; + +/* + * Macros for accessing elements of the global handle + */ +#define HYPRE_GPU_HANDLE hypre__global_handle.initd +#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle +#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle +#define HYPRE_DEVICE hypre__global_handle.device +#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count +#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr +#define HYPRE_STREAM(index) (hypre__global_handle.streams[index]) +#define HYPRE_DOMAIN hypre__global_handle.nvtx_domain +#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access +#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM + +#endif + +#else + +#define hypre_GPUInit(use_device) +#define hypre_GPUFinalize() + +#endif + +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ /* amg_linklist.c */ void hypre_dispose_elt ( hypre_LinkList element_ptr ); @@ -869,21 +1407,21 @@ #endif /* hypre_printf.c */ -#ifdef HYPRE_BIGINT +// #ifdef HYPRE_BIGINT HYPRE_Int hypre_printf( const char *format , ... ); HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... ); HYPRE_Int hypre_sprintf( char *s , const char *format, ... ); HYPRE_Int hypre_scanf( const char *format , ... ); HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... ); HYPRE_Int hypre_sscanf( char *s , const char *format, ... ); -#else -#define hypre_printf printf -#define hypre_fprintf fprintf -#define hypre_sprintf sprintf -#define hypre_scanf scanf -#define hypre_fscanf fscanf -#define hypre_sscanf sscanf -#endif +// #else +// #define hypre_printf printf +// #define hypre_fprintf fprintf +// #define hypre_sprintf sprintf +// #define hypre_scanf scanf +// #define hypre_fscanf fscanf +// #define hypre_sscanf sscanf +// #endif /* hypre_qsort.c */ void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j ); @@ -906,8 +1444,9 @@ HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept ); /* random.c */ -void hypre_SeedRand ( HYPRE_Int seed ); -HYPRE_Real hypre_Rand ( void ); +HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed ); +HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void ); +HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void ); /* hypre_prefix_sum.c */ /** @@ -1057,7 +1596,5 @@ } #endif -/*#include "hypre_hopscotch_hash.h"*/ - #endif diff -Nru hypre-2.11.2/src/utilities/HYPRE_utilities.h hypre-2.13.0/src/utilities/HYPRE_utilities.h --- hypre-2.11.2/src/utilities/HYPRE_utilities.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/HYPRE_utilities.h 2017-10-20 17:42:22.000000000 +0000 @@ -43,25 +43,51 @@ * Big int stuff *--------------------------------------------------------------------------*/ -#ifdef HYPRE_BIGINT +#if defined(HYPRE_BIGINT) typedef long long int HYPRE_Int; #define HYPRE_MPI_INT MPI_LONG_LONG_INT -#else + +#else /* default */ typedef int HYPRE_Int; #define HYPRE_MPI_INT MPI_INT #endif /*-------------------------------------------------------------------------- - * Complex stuff + * Real and Complex types *--------------------------------------------------------------------------*/ +#include + +#if defined(HYPRE_SINGLE) +typedef float HYPRE_Real; +#define HYPRE_REAL_MAX FLT_MAX +#define HYPRE_REAL_MIN FLT_MIN +#define HYPRE_REAL_EPSILON FLT_EPSILON +#define HYPRE_REAL_MIN_EXP FLT_MIN_EXP +#define HYPRE_MPI_REAL MPI_FLOAT + +#elif defined(HYPRE_LONG_DOUBLE) +typedef long double HYPRE_Real; +#define HYPRE_REAL_MAX LDBL_MAX +#define HYPRE_REAL_MIN LDBL_MIN +#define HYPRE_REAL_EPSILON LDBL_EPSILON +#define HYPRE_REAL_MIN_EXP DBL_MIN_EXP +#define HYPRE_MPI_REAL MPI_LONG_DOUBLE + +#else /* default */ typedef double HYPRE_Real; +#define HYPRE_REAL_MAX DBL_MAX +#define HYPRE_REAL_MIN DBL_MIN +#define HYPRE_REAL_EPSILON DBL_EPSILON +#define HYPRE_REAL_MIN_EXP DBL_MIN_EXP #define HYPRE_MPI_REAL MPI_DOUBLE +#endif -#ifdef HYPRE_COMPLEX +#if defined(HYPRE_COMPLEX) typedef double _Complex HYPRE_Complex; #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX /* or MPI_LONG_DOUBLE ? */ -#else + +#else /* default */ typedef HYPRE_Real HYPRE_Complex; #define HYPRE_MPI_COMPLEX HYPRE_MPI_REAL #endif diff -Nru hypre-2.11.2/src/utilities/Makefile hypre-2.13.0/src/utilities/Makefile --- hypre-2.11.2/src/utilities/Makefile 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/Makefile 2017-10-20 17:42:22.000000000 +0000 @@ -60,7 +60,9 @@ F90_HYPRE_error.c\ hypre_prefix_sum.c\ hypre_merge_sort.c\ - hypre_hopscotch_hash.c + hypre_hopscotch_hash.c\ + gpuErrorCheck.c\ + gpuMem.c OBJS = ${FILES:.c=.o} diff -Nru hypre-2.11.2/src/utilities/mpistubs.c hypre-2.13.0/src/utilities/mpistubs.c --- hypre-2.11.2/src/utilities/mpistubs.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/mpistubs.c 2017-10-20 17:42:22.000000000 +0000 @@ -25,7 +25,7 @@ #ifdef HYPRE_HAVE_MPI_COMM_F2C return (hypre_MPI_Comm) MPI_Comm_f2c(comm); #else - return (hypre_MPI_Comm) comm; + return (hypre_MPI_Comm) (size_t)comm; #endif } diff -Nru hypre-2.11.2/src/utilities/mpistubs.h hypre-2.13.0/src/utilities/mpistubs.h --- hypre-2.11.2/src/utilities/mpistubs.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/mpistubs.h 2017-10-20 17:42:22.000000000 +0000 @@ -46,10 +46,13 @@ #define MPI_COMM_WORLD hypre_MPI_COMM_WORLD #define MPI_COMM_NULL hypre_MPI_COMM_NULL +#define MPI_COMM_SELF hypre_MPI_COMM_SELF #define MPI_BOTTOM hypre_MPI_BOTTOM +#define MPI_FLOAT hypre_MPI_FLOAT #define MPI_DOUBLE hypre_MPI_DOUBLE +#define MPI_LONG_DOUBLE hypre_MPI_LONG_DOUBLE #define MPI_INT hypre_MPI_INT #define MPI_LONG_LONG_INT hypre_MPI_INT #define MPI_CHAR hypre_MPI_CHAR @@ -61,6 +64,9 @@ #define MPI_MIN hypre_MPI_MIN #define MPI_MAX hypre_MPI_MAX #define MPI_LOR hypre_MPI_LOR +#define MPI_LAND hypre_MPI_LAND +#define MPI_SUCCESS hypre_MPI_SUCCESS +#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE #define MPI_UNDEFINED hypre_MPI_UNDEFINED #define MPI_REQUEST_NULL hypre_MPI_REQUEST_NULL @@ -68,8 +74,6 @@ #define MPI_ANY_TAG hypre_MPI_ANY_TAG #define MPI_SOURCE hypre_MPI_SOURCE #define MPI_TAG hypre_MPI_TAG -#define MPI_SUCCESS hypre_MPI_SUCCESS -#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE #define MPI_Init hypre_MPI_Init #define MPI_Finalize hypre_MPI_Finalize @@ -93,9 +97,9 @@ #define MPI_Allgather hypre_MPI_Allgather #define MPI_Allgatherv hypre_MPI_Allgatherv #define MPI_Gather hypre_MPI_Gather -#define MPI_Gatherv hypre_MPI_Gatherv +#define MPI_Gatherv hypre_MPI_Gatherv #define MPI_Scatter hypre_MPI_Scatter -#define MPI_Scatterv hypre_MPI_Scatterv +#define MPI_Scatterv hypre_MPI_Scatterv #define MPI_Bcast hypre_MPI_Bcast #define MPI_Send hypre_MPI_Send #define MPI_Recv hypre_MPI_Recv @@ -122,6 +126,9 @@ #define MPI_Type_struct hypre_MPI_Type_struct #define MPI_Type_commit hypre_MPI_Type_commit #define MPI_Type_free hypre_MPI_Type_free +#define MPI_Op_free hypre_MPI_Op_free +#define MPI_Op_create hypre_MPI_Op_create +#define MPI_User_function hypre_MPI_User_function /*-------------------------------------------------------------------------- * Types, etc. @@ -132,6 +139,7 @@ typedef HYPRE_Int hypre_MPI_Group; typedef HYPRE_Int hypre_MPI_Request; typedef HYPRE_Int hypre_MPI_Datatype; +typedef void (hypre_MPI_User_function) (); typedef struct { @@ -141,23 +149,27 @@ typedef HYPRE_Int hypre_MPI_Op; typedef HYPRE_Int hypre_MPI_Aint; +#define hypre_MPI_COMM_SELF 1 #define hypre_MPI_COMM_WORLD 0 #define hypre_MPI_COMM_NULL -1 #define hypre_MPI_BOTTOM 0x0 -#define hypre_MPI_DOUBLE 0 -#define hypre_MPI_INT 1 -#define hypre_MPI_CHAR 2 -#define hypre_MPI_LONG 3 -#define hypre_MPI_BYTE 4 -#define hypre_MPI_REAL 5 -#define hypre_MPI_COMPLEX 6 +#define hypre_MPI_FLOAT 0 +#define hypre_MPI_DOUBLE 1 +#define hypre_MPI_LONG_DOUBLE 2 +#define hypre_MPI_INT 3 +#define hypre_MPI_CHAR 4 +#define hypre_MPI_LONG 5 +#define hypre_MPI_BYTE 6 +#define hypre_MPI_REAL 7 +#define hypre_MPI_COMPLEX 8 #define hypre_MPI_SUM 0 #define hypre_MPI_MIN 1 #define hypre_MPI_MAX 2 #define hypre_MPI_LOR 3 +#define hypre_MPI_LAND 4 #define hypre_MPI_SUCCESS 0 #define hypre_MPI_STATUSES_IGNORE 0 @@ -179,13 +191,16 @@ typedef MPI_Status hypre_MPI_Status; typedef MPI_Op hypre_MPI_Op; typedef MPI_Aint hypre_MPI_Aint; +typedef MPI_User_function hypre_MPI_User_function; #define hypre_MPI_COMM_WORLD MPI_COMM_WORLD #define hypre_MPI_COMM_NULL MPI_COMM_NULL #define hypre_MPI_BOTTOM MPI_BOTTOM -#define hypre_MPI_SUCCESS MPI_SUCCESS +#define hypre_MPI_COMM_SELF MPI_COMM_SELF +#define hypre_MPI_FLOAT MPI_FLOAT #define hypre_MPI_DOUBLE MPI_DOUBLE +#define hypre_MPI_LONG_DOUBLE MPI_LONG_DOUBLE /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */ #define hypre_MPI_INT HYPRE_MPI_INT #define hypre_MPI_CHAR MPI_CHAR @@ -200,6 +215,8 @@ #define hypre_MPI_MIN MPI_MIN #define hypre_MPI_MAX MPI_MAX #define hypre_MPI_LOR MPI_LOR +#define hypre_MPI_SUCCESS MPI_SUCCESS +#define hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE #define hypre_MPI_UNDEFINED MPI_UNDEFINED #define hypre_MPI_REQUEST_NULL MPI_REQUEST_NULL @@ -207,7 +224,6 @@ #define hypre_MPI_ANY_TAG MPI_ANY_TAG #define hypre_MPI_SOURCE MPI_SOURCE #define hypre_MPI_TAG MPI_TAG -#define hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE #define hypre_MPI_LAND MPI_LAND #endif @@ -272,9 +288,12 @@ HYPRE_Int hypre_MPI_Type_struct( HYPRE_Int count , HYPRE_Int *array_of_blocklengths , hypre_MPI_Aint *array_of_displacements , hypre_MPI_Datatype *array_of_types , hypre_MPI_Datatype *newtype ); HYPRE_Int hypre_MPI_Type_commit( hypre_MPI_Datatype *datatype ); HYPRE_Int hypre_MPI_Type_free( hypre_MPI_Datatype *datatype ); +HYPRE_Int hypre_MPI_Op_free( hypre_MPI_Op *op ); +HYPRE_Int hypre_MPI_Op_create( hypre_MPI_User_function *function , hypre_int commute , hypre_MPI_Op *op ); #ifdef __cplusplus } #endif #endif + diff -Nru hypre-2.11.2/src/utilities/protos.h hypre-2.13.0/src/utilities/protos.h --- hypre-2.11.2/src/utilities/protos.h 1970-01-01 00:00:00.000000000 +0000 +++ hypre-2.13.0/src/utilities/protos.h 2017-10-20 17:42:22.000000000 +0000 @@ -0,0 +1,221 @@ +/*BHEADER********************************************************************** + * Copyright (c) 2008, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * This file is part of HYPRE. See file COPYRIGHT for details. + * + * HYPRE is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License (as published by the Free + * Software Foundation) version 2.1 dated February 1999. + * + * $Revision$ + ***********************************************************************EHEADER*/ + +/* amg_linklist.c */ +void hypre_dispose_elt ( hypre_LinkList element_ptr ); +void hypre_remove_point ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where ); +hypre_LinkList hypre_create_elt ( HYPRE_Int Item ); +void hypre_enter_on_lists ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where ); + +/* binsearch.c */ +HYPRE_Int hypre_BinarySearch ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int list_length ); +HYPRE_Int hypre_BinarySearch2 ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int low , HYPRE_Int high , HYPRE_Int *spot ); +HYPRE_Int *hypre_LowerBound( HYPRE_Int *first, HYPRE_Int *last, HYPRE_Int value ); + +/* hypre_complex.c */ +#ifdef HYPRE_COMPLEX +HYPRE_Complex hypre_conj( HYPRE_Complex value ); +HYPRE_Real hypre_cabs( HYPRE_Complex value ); +HYPRE_Real hypre_creal( HYPRE_Complex value ); +HYPRE_Real hypre_cimag( HYPRE_Complex value ); +#else +#define hypre_conj(value) value +#define hypre_cabs(value) fabs(value) +#define hypre_creal(value) value +#define hypre_cimag(value) 0.0 +#endif + +/* hypre_printf.c */ +// #ifdef HYPRE_BIGINT +HYPRE_Int hypre_printf( const char *format , ... ); +HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... ); +HYPRE_Int hypre_sprintf( char *s , const char *format, ... ); +HYPRE_Int hypre_scanf( const char *format , ... ); +HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... ); +HYPRE_Int hypre_sscanf( char *s , const char *format, ... ); +// #else +// #define hypre_printf printf +// #define hypre_fprintf fprintf +// #define hypre_sprintf sprintf +// #define hypre_scanf scanf +// #define hypre_fscanf fscanf +// #define hypre_sscanf sscanf +// #endif + +/* hypre_qsort.c */ +void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap3_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap4_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int i , HYPRE_Int j ); +void hypre_swap_d ( HYPRE_Real *v , HYPRE_Int i , HYPRE_Int j ); +void hypre_qsort0 ( HYPRE_Int *v , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort1 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort3_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort4_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int left , HYPRE_Int right ); +void hypre_qsort_abs ( HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right ); + +/* qsplit.c */ +HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept ); + +/* random.c */ +HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed ); +HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void ); +HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void ); + +/* hypre_prefix_sum.c */ +/** + * Assumed to be called within an omp region. + * Let x_i be the input of ith thread. + * The output of ith thread y_i = x_0 + x_1 + ... + x_{i-1} + * Additionally, sum = x_0 + x_1 + ... + x_{nthreads - 1} + * Note that always y_0 = 0 + * + * @param workspace at least with length (nthreads+1) + * workspace[tid] will contain result for tid + * workspace[nthreads] will contain sum + */ +void hypre_prefix_sum(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int *workspace); +/** + * This version does prefix sum in pair. + * Useful when we prefix sum of diag and offd in tandem. + * + * @param worksapce at least with length 2*(nthreads+1) + * workspace[2*tid] and workspace[2*tid+1] will contain results for tid + * workspace[3*nthreads] and workspace[3*nthreads + 1] will contain sums + */ +void hypre_prefix_sum_pair(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *workspace); +/** + * @param workspace at least with length 3*(nthreads+1) + * workspace[3*tid:3*tid+3) will contain results for tid + */ +void hypre_prefix_sum_triple(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *in_out3, HYPRE_Int *sum3, HYPRE_Int *workspace); + +/** + * n prefix-sums together. + * workspace[n*tid:n*(tid+1)) will contain results for tid + * workspace[nthreads*tid:nthreads*(tid+1)) will contain sums + * + * @param workspace at least with length n*(nthreads+1) + */ +void hypre_prefix_sum_multiple(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int n, HYPRE_Int *workspace); + +/* hypre_merge_sort.c */ +/** + * Why merge sort? + * 1) Merge sort can take advantage of eliminating duplicates. + * 2) Merge sort is more efficiently parallelizable than qsort + */ + +/** + * Out of place merge sort with duplicate elimination + * @ret number of unique elements + */ +HYPRE_Int hypre_merge_sort_unique(HYPRE_Int *in, HYPRE_Int *out, HYPRE_Int len); +/** + * Out of place merge sort with duplicate elimination + * + * @param out pointer to output can be in or temp + * @ret number of unique elements + */ +HYPRE_Int hypre_merge_sort_unique2(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **out); + +void hypre_merge_sort(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **sorted); + +/* hypre_hopscotch_hash.c */ + +#ifdef HYPRE_USING_OPENMP + +/* Check if atomic operations are available to use concurrent hopscotch hash table */ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) && (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40100 +#define HYPRE_USING_ATOMIC +//#elif defined _MSC_VER // JSP: haven't tested, so comment out for now +//#define HYPRE_USING_ATOMIC +//#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__) +// JSP: not many compilers have implemented this, so comment out for now +//#define HYPRE_USING_ATOMIC +//#include +#endif + +#endif // HYPRE_USING_OPENMP + +#ifdef HYPRE_HOPSCOTCH +#ifdef HYPRE_USING_ATOMIC +// concurrent hopscotch hashing is possible only with atomic supports +#define HYPRE_CONCURRENT_HOPSCOTCH +#endif +#endif + +#ifdef HYPRE_CONCURRENT_HOPSCOTCH +typedef struct { + HYPRE_Int volatile timestamp; + omp_lock_t lock; +} hypre_HopscotchSegment; +#endif + +/** + * The current typical use case of unordered set is putting input sequence + * with lots of duplication (putting all colidx received from other ranks), + * followed by one sweep of enumeration. + * Since the capacity is set to the number of inputs, which is much larger + * than the number of unique elements, we optimize for initialization and + * enumeration whose time is proportional to the capacity. + * For initialization and enumeration, structure of array (SoA) is better + * for vectorization, cache line utilization, and so on. + */ +typedef struct +{ + HYPRE_Int volatile segmentMask; + HYPRE_Int volatile bucketMask; +#ifdef HYPRE_CONCURRENT_HOPSCOTCH + hypre_HopscotchSegment* volatile segments; +#endif + HYPRE_Int *volatile key; + hypre_uint *volatile hopInfo; + HYPRE_Int *volatile hash; +} hypre_UnorderedIntSet; + +typedef struct +{ + hypre_uint volatile hopInfo; + HYPRE_Int volatile hash; + HYPRE_Int volatile key; + HYPRE_Int volatile data; +} hypre_HopscotchBucket; + +/** + * The current typical use case of unoredered map is putting input sequence + * with no duplication (inverse map of a bijective mapping) followed by + * lots of lookups. + * For lookup, array of structure (AoS) gives better cache line utilization. + */ +typedef struct +{ + HYPRE_Int volatile segmentMask; + HYPRE_Int volatile bucketMask; +#ifdef HYPRE_CONCURRENT_HOPSCOTCH + hypre_HopscotchSegment* volatile segments; +#endif + hypre_HopscotchBucket* volatile table; +} hypre_UnorderedIntMap; + +/** + * Sort array "in" with length len and put result in array "out" + * "in" will be deallocated unless in == *out + * inverse_map is an inverse hash table s.t. inverse_map[i] = j iff (*out)[j] = i + */ +void hypre_sort_and_create_inverse_map( + HYPRE_Int *in, HYPRE_Int len, HYPRE_Int **out, hypre_UnorderedIntMap *inverse_map); diff -Nru hypre-2.11.2/src/utilities/random.c hypre-2.13.0/src/utilities/random.c --- hypre-2.11.2/src/utilities/random.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/random.c 2017-10-20 17:42:22.000000000 +0000 @@ -38,25 +38,39 @@ #include "_hypre_utilities.h" -/*-------------------------------------------------------------------------- - * Static variables - *--------------------------------------------------------------------------*/ - +#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED) +__managed__ __device__ +#endif + +/*------------------------------------------------------------------------------- + * Static global variable: Seed + * ``... all initial seeds between 1 and 2147483646 (2^31-2) are equally valid'' + *-------------------------------------------------------------------------------*/ static HYPRE_Int Seed = 13579; -#define a 16807 -#define m 2147483647 -#define q 127773 -#define r 2836 +#define a 16807 /* 7^5 */ +#define m 2147483647 /* 2*31 - 1 */ +#define q 127773 /* m div a */ +#define r 2836 /* m mod a */ /*-------------------------------------------------------------------------- * Initializes the pseudo-random number generator to a place in the sequence. * * @param seed an HYPRE_Int containing the seed for the RNG. *--------------------------------------------------------------------------*/ - +HYPRE_CUDA_GLOBAL void hypre_SeedRand( HYPRE_Int seed ) { + /* RL: seed must be between 1 and 2^31-2 */ + if (seed < 1) + { + seed = 1; + } + else if (seed >= m) + { + seed = m - 1; + } + Seed = seed; } @@ -64,11 +78,10 @@ * Computes the next pseudo-random number in the sequence using the global * variable Seed. * - * @return a HYPRE_Real containing the next number in the sequence divided by - * 2147483647 so that the numbers are in (0, 1]. + * @return a HYPRE_Int between (0, 2147483647] *--------------------------------------------------------------------------*/ - -HYPRE_Real hypre_Rand() +HYPRE_CUDA_GLOBAL +HYPRE_Int hypre_RandI() { HYPRE_Int low, high, test; @@ -84,5 +97,19 @@ Seed = test + m; } - return ((HYPRE_Real)(Seed) / m); + return Seed; } + +/*-------------------------------------------------------------------------- + * Computes the next pseudo-random number in the sequence using the global + * variable Seed. + * + * @return a HYPRE_Real containing the next number in the sequence divided by + * 2147483647 so that the numbers are in (0, 1]. + *--------------------------------------------------------------------------*/ +HYPRE_CUDA_GLOBAL +HYPRE_Real hypre_Rand() +{ + return ((HYPRE_Real)(hypre_RandI()) / m); +} + diff -Nru hypre-2.11.2/src/utilities/threading.h hypre-2.13.0/src/utilities/threading.h --- hypre-2.11.2/src/utilities/threading.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/threading.h 2017-10-20 17:42:22.000000000 +0000 @@ -30,3 +30,4 @@ void hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n ); #endif + diff -Nru hypre-2.11.2/src/utilities/timing.c hypre-2.13.0/src/utilities/timing.c --- hypre-2.11.2/src/utilities/timing.c 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/timing.c 2017-10-20 17:42:22.000000000 +0000 @@ -328,9 +328,9 @@ local_wall_time = hypre_TimingWallTime(i); local_cpu_time = hypre_TimingCPUTime(i); hypre_MPI_Allreduce(&local_wall_time, &wall_time, 1, - hypre_MPI_DOUBLE, hypre_MPI_MAX, comm); + hypre_MPI_REAL, hypre_MPI_MAX, comm); hypre_MPI_Allreduce(&local_cpu_time, &cpu_time, 1, - hypre_MPI_DOUBLE, hypre_MPI_MAX, comm); + hypre_MPI_REAL, hypre_MPI_MAX, comm); if (myrank == 0) { diff -Nru hypre-2.11.2/src/utilities/timing.h hypre-2.13.0/src/utilities/timing.h --- hypre-2.11.2/src/utilities/timing.h 2017-03-13 19:37:24.000000000 +0000 +++ hypre-2.13.0/src/utilities/timing.h 2017-10-20 17:42:22.000000000 +0000 @@ -48,8 +48,8 @@ #define hypre_IncFLOPCount(inc) #define hypre_BeginTiming(i) #define hypre_EndTiming(i) -#define hypre_ClearTiming() #define hypre_PrintTiming(heading, comm) +#define hypre_ClearTiming() /*-------------------------------------------------------------------------- * With timing on @@ -119,3 +119,4 @@ #endif #endif +